{"created_at": "2025-08-15T04:35:53.042908", "global_step": 2000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.2952599083847839, "acc_stderr,none": 0.004552272447071703, "acc_norm,none": 0.3202549292969528, "acc_norm_stderr,none": 0.0046562089515414335}, "mmlu": {"acc,none": 0.2299529981484119, "acc_stderr,none": 0.003544717019338066, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24229543039319873, "acc_stderr,none": 0.006244290130481456, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733393}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.23817186997103315, "acc_stderr,none": 0.0076198116748855535, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23499361430395913, "acc_stderr,none": 0.015162024152278452}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02380518652488814}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.023157468308559342}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21904452388690282, "acc_stderr,none": 0.007450686873223054, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246794}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1908256880733945, "acc_stderr,none": 0.016847676400091105}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21408182683158897, "acc_stderr,none": 0.007289227411736005, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686936}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277716}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.02103733150526289}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1870967741935484, "acc_stderr,none": 0.02218571009225225}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15763546798029557, "acc_stderr,none": 0.025639014131172404}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.717, "acc_stderr,none": 0.014251810906481739, "acc_norm,none": 0.621, "acc_norm_stderr,none": 0.01534909100222535}}
{"created_at": "2025-08-15T06:15:11.990673", "global_step": 4000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3296156144194384, "acc_stderr,none": 0.004691128722535484, "acc_norm,none": 0.3895638319059948, "acc_norm_stderr,none": 0.00486654742235557}, "mmlu": {"acc,none": 0.23614869676684233, "acc_stderr,none": 0.0035805291467481265, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25377258235919237, "acc_stderr,none": 0.006337762888412889, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790606}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.31862745098039214, "acc_stderr,none": 0.032702871814820816}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.02875679962965834}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.03695980128098824}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742178}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.023618678310069356}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2245810055865922, "acc_stderr,none": 0.013956803666544643}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.02255244778047804}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2716049382716049, "acc_stderr,none": 0.02474862449053737}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27444589308996087, "acc_stderr,none": 0.011397043163078154}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.0330140594698725}, "mmlu_other": {"acc,none": 0.22207917605407146, "acc_stderr,none": 0.007450884283110314, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.18867924528301888, "acc_stderr,none": 0.02407999513006224}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.030631145539198823}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.26905829596412556, "acc_stderr,none": 0.029763779406874972}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.027421007295392912}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.015302380123542103}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888146}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22695035460992907, "acc_stderr,none": 0.024987106365642962}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15808823529411764, "acc_stderr,none": 0.02216146260806852}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.22424439389015274, "acc_stderr,none": 0.007518543718760655, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489361}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.029126522834586808}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178253}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19230769230769232, "acc_stderr,none": 0.019982347208637296}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.01776597865232755}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082395}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167418}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940588}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17959183673469387, "acc_stderr,none": 0.024573293589585637}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772443}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_stem": {"acc,none": 0.2353314303837615, "acc_stderr,none": 0.007555375457067481, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03944624162501116}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.033911609343436025}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.0368452949177471}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.2297872340425532, "acc_stderr,none": 0.027501752944412424}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135303}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21957671957671956, "acc_stderr,none": 0.021320018599770355}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22580645161290322, "acc_stderr,none": 0.02378557788418101}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.0307127300709826}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355154}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "sciq": {"alias": "sciq", "acc,none": 0.782, "acc_stderr,none": 0.013063179040595296, "acc_norm,none": 0.705, "acc_norm_stderr,none": 0.014428554438445514}}
{"created_at": "2025-08-15T07:56:13.032182", "global_step": 6000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883523}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.34385580561641105, "acc_stderr,none": 0.004740229212473453, "acc_norm,none": 0.4213304122684724, "acc_norm_stderr,none": 0.0049276318064775575}, "mmlu": {"acc,none": 0.23287281014100555, "acc_stderr,none": 0.003560609871597591, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24165781083953242, "acc_stderr,none": 0.006240346689185181, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604672}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139404}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604257}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2869198312236287, "acc_stderr,none": 0.029443773022594693}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.03520893951097653}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.0351238528370505}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.02289408248992599}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2324022346368715, "acc_stderr,none": 0.014125968754673387}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18006430868167203, "acc_stderr,none": 0.02182342285774494}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.02389187954195961}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24837027379400262, "acc_stderr,none": 0.011035212598034494}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.24589636305117477, "acc_stderr,none": 0.007712674390789875, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.02634148037111835}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.28699551569506726, "acc_stderr,none": 0.030360379710291947}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.043546310772605956}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.029996951858349476}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23116219667943805, "acc_stderr,none": 0.015075523238101077}, "mmlu_nutrition": {"alias": " 
- nutrition", "acc,none": 0.20915032679738563, "acc_stderr,none": 0.02328768531233481}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.02635806569888059}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683229}, "mmlu_social_sciences": {"acc,none": 0.2216444588885278, "acc_stderr,none": 0.007469622790997359, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.028335609732463348}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.12953367875647667, "acc_stderr,none": 0.024233532297758733}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.02056753956724679}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.1638655462184874, "acc_stderr,none": 0.024044054940440495}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618688}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23366013071895425, "acc_stderr,none": 0.017119158496044503}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2897959183673469, "acc_stderr,none": 0.02904308868330432}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.02947525023601717}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.21788772597526165, "acc_stderr,none": 0.007337043114542628, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.03972552884785137}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.034370793441061365}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.036845294917747094}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.1568627450980392, "acc_stderr,none": 0.03618664819936246}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.02785125297388977}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02113285918275444}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.02413763242933771}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03010833071801162}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.13425925925925927, "acc_stderr,none": 0.0232512775905459}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "sciq": {"alias": "sciq", "acc,none": 0.815, "acc_stderr,none": 0.01228519132638669, "acc_norm,none": 0.72, "acc_norm_stderr,none": 0.014205696104091496}}
{"created_at": "2025-08-15T09:36:30.934894", "global_step": 8000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634101}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.36128261302529374, "acc_stderr,none": 0.00479390492240189, "acc_norm,none": 0.4458275243975304, "acc_norm_stderr,none": 0.0049604083621332395}, "mmlu": {"acc,none": 0.23629112662013957, "acc_stderr,none": 0.003578736000130443, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2410201912858661, "acc_stderr,none": 0.006233031714437232, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.035670166752768614}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.035679697722680474}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604243}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.029571601065753364}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.03849856098794088}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286773}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.022497230190967558}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480757}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294674}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23598435462842243, "acc_stderr,none": 0.010844802669662694}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.24686192468619247, "acc_stderr,none": 0.007706071873712333, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.0271342916287417}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.35874439461883406, "acc_stderr,none": 0.03219079200419995}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646033}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834949}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23499361430395913, "acc_stderr,none": 0.015162024152278448}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.20261437908496732, "acc_stderr,none": 0.023015446877985662}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.21631205673758866, "acc_stderr,none": 0.0245617205605628}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.0355092018568963}, "mmlu_social_sciences": {"acc,none": 0.2382190445238869, "acc_stderr,none": 0.007671445597608901, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.029620227874790486}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.02925282329180363}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25137614678899084, "acc_stderr,none": 0.018599206360287415}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.018054027458815194}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644287}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17551020408163265, "acc_stderr,none": 0.024352800722970015}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_stem": {"acc,none": 0.21693625118934348, "acc_stderr,none": 0.0073341904797528355, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.14, "acc_stderr,none": 0.0348735088019777}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17037037037037037, "acc_stderr,none": 0.032477811859955935}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325004}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 
0.3021276595744681, "acc_stderr,none": 0.030017554471880554}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.19310344827586207, "acc_stderr,none": 0.032894455221274016}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.021037331505262893}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1870967741935484, "acc_stderr,none": 0.022185710092252255}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444434}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.02592887613276611}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.03257847384436778}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18981481481481483, "acc_stderr,none": 0.026744714834691916}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "sciq": {"alias": "sciq", "acc,none": 0.816, "acc_stderr,none": 0.012259457340938577, "acc_norm,none": 0.742, "acc_norm_stderr,none": 0.013842963108656603}}
{"created_at": "2025-08-15T11:17:19.084568", "global_step": 10000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3719378609838678, "acc_stderr,none": 0.004823341569605419, "acc_norm,none": 0.46504680342561244, "acc_norm_stderr,none": 0.004977574188421321}, "mmlu": {"acc,none": 0.23878364905284147, "acc_stderr,none": 0.0035937820855881123, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2501594048884166, "acc_stderr,none": 0.006317261463633039, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501936}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507416}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742177}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.024476994076247333}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2508038585209003, "acc_stderr,none": 0.024619771956697165}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2808641975308642, "acc_stderr,none": 0.02500646975579921}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.242503259452412, "acc_stderr,none": 0.010946570966348787}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.033014059469872487}, "mmlu_other": {"acc,none": 0.25426456388799484, "acc_stderr,none": 0.00779425337172766, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.026749899771241238}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34080717488789236, "acc_stderr,none": 0.03181149747055359}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674057}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2541507024265645, "acc_stderr,none": 0.015569254692045764}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.20915032679738563, "acc_stderr,none": 0.02328768531233481}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22695035460992907, "acc_stderr,none": 0.02498710636564297}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487424}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3072289156626506, "acc_stderr,none": 0.03591566797824664}, "mmlu_social_sciences": {"acc,none": 0.22684432889177772, "acc_stderr,none": 0.007543790731278747, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.0414243971948936}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18686868686868688, "acc_stderr,none": 0.02777253333421899}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.18205128205128204, "acc_stderr,none": 0.0195652367829309}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.18907563025210083, "acc_stderr,none": 0.02543511943810535}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24036697247706423, "acc_stderr,none": 0.01832060732096407}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.037276735755969195}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_stem": {"acc,none": 0.2182048842372344, "acc_stderr,none": 0.0073550957579941595, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174021}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.0327900040631005}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.039505818611799616}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.021411684393694196}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.027719315709614775}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.02549753263960955}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.16556291390728478, "acc_stderr,none": 0.0303481834103036}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.025130453652268455}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419072}, "sciq": {"alias": "sciq", "acc,none": 0.812, "acc_stderr,none": 0.012361586015103754, "acc_norm,none": 0.73, "acc_norm_stderr,none": 0.014046255632633915}}
{"created_at": "2025-08-15T12:59:48.593337", "global_step": 12000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19656019656019655, "acc_stderr,none": 0.01137743977396399}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38129854610635333, "acc_stderr,none": 0.004847129907908661, "acc_norm,none": 0.4833698466440948, "acc_norm_stderr,none": 0.004987020679861266}, "mmlu": {"acc,none": 0.23529411764705882, "acc_stderr,none": 0.003575522235145229, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24293304994686504, "acc_stderr,none": 0.006252128087844268, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924317}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516304}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.044531975073749834}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934722}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.02344582627654555}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478026}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2716049382716049, "acc_stderr,none": 0.024748624490537382}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23272490221642764, "acc_stderr,none": 0.010792595553888496}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.2471837785645317, "acc_stderr,none": 0.007721455979706499, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.026616482980501715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.030299574664788137}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3452914798206278, "acc_stderr,none": 0.031911001928357934}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928313}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22860791826309068, "acc_stderr,none": 0.015016884698539887}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.024630048979824775}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.02352924218519311}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.034605799075530276}, "mmlu_social_sciences": {"acc,none": 0.22424439389015274, "acc_stderr,none": 0.007519089807066041, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02655220782821529}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775295}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246794}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.02626502460827589}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.018075750241633153}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.03547771004159465}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.02947525023601719}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.22296225816682524, "acc_stderr,none": 0.007401266006496696, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536934}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.03391160934343602}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03708284662416544}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", 
"acc,none": 0.32340425531914896, "acc_stderr,none": 0.030579442773610337}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.034559302019248124}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154523}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1967741935483871, "acc_stderr,none": 0.022616409420742025}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.16748768472906403, "acc_stderr,none": 0.026273086047535414}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.18543046357615894, "acc_stderr,none": 0.031732843842942865}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355164}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976235}, "sciq": {"alias": "sciq", "acc,none": 0.82, "acc_stderr,none": 0.01215515313551196, "acc_norm,none": 0.73, "acc_norm_stderr,none": 0.014046255632633915}}
{"created_at": "2025-08-15T14:46:55.635226", "global_step": 14000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1891891891891892, "acc_stderr,none": 0.011213159711868589}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39384584744074885, "acc_stderr,none": 0.004876028037941936, "acc_norm,none": 0.49950209121688904, "acc_norm_stderr,none": 0.004989778937380362}, "mmlu": {"acc,none": 0.23807149978635522, "acc_stderr,none": 0.003593434786987252, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23910733262486716, "acc_stderr,none": 0.00621580965000297, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03453131801885415}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.028379449451588663}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955938}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.045879047413018105}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.03512385283705051}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19935691318327975, "acc_stderr,none": 0.022691033780549656}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733396}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2320730117340287, "acc_stderr,none": 0.010782046665905197}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.03301405946987251}, "mmlu_other": {"acc,none": 0.24267782426778242, "acc_stderr,none": 0.007700526314114285, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.20754716981132076, "acc_stderr,none": 0.024959918028911267}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.03214737302029469}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2600896860986547, "acc_stderr,none": 0.029442495585857473}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.028286324075564407}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26053639846743293, "acc_stderr,none": 0.015696008563807096}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879905}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.02551873104953777}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.025187786660227262}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.23074423139421515, "acc_stderr,none": 0.00759481853221161, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.042270544512322}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.028335609732463355}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909895}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2282051282051282, "acc_stderr,none": 0.02127839386358628}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.19747899159663865, "acc_stderr,none": 0.025859164122051463}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21834862385321102, "acc_stderr,none": 0.017712600528722738}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728745}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23366013071895425, "acc_stderr,none": 0.017119158496044503}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579157}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.031157150869355558}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.2391373295274342, "acc_stderr,none": 0.007587093984040328, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03944624162501116}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17105263157894737, "acc_stderr,none": 0.0306436070716771}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.03827052357950756}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.044405219061793254}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.19310344827586207, "acc_stderr,none": 0.032894455221274016}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.023266512213730554}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1967741935483871, "acc_stderr,none": 0.022616409420742018}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21674876847290642, "acc_stderr,none": 0.028990331252516235}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712166}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18981481481481483, "acc_stderr,none": 0.026744714834691916}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "sciq": {"alias": "sciq", "acc,none": 0.818, "acc_stderr,none": 0.01220758063766215, "acc_norm,none": 0.746, "acc_norm_stderr,none": 0.013772206565168544}}
{"created_at": "2025-08-15T16:48:38.463134", "global_step": 16000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.01146601146601155}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40420235012945627, "acc_stderr,none": 0.0048973407933143795, "acc_norm,none": 0.5131447918741286, "acc_norm_stderr,none": 0.004988056789119671}, "mmlu": {"acc,none": 0.24732944025067655, "acc_stderr,none": 0.0036395531680241005, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24654622741764082, "acc_stderr,none": 0.006281847356036724, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009179}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.028867431449849313}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302871}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.03487825168497892}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2090032154340836, "acc_stderr,none": 0.02309314039837422}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2993827160493827, "acc_stderr,none": 0.025483115601195455}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23402868318122555, "acc_stderr,none": 0.010813585552659677}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.034678266857038245}, "mmlu_other": {"acc,none": 0.2648857418731896, "acc_stderr,none": 0.007915183765568806, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756191}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889925}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.031381476375754995}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523418}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2771392081736909, "acc_stderr,none": 0.016005636294122428}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.023929155517351287}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590634}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.02472311040767707}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511116}, "mmlu_social_sciences": {"acc,none": 0.23269418264543387, "acc_stderr,none": 0.007619935269452136, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436695}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.02805779167298902}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817244}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2, "acc_stderr,none": 0.020280805062535722}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882385}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24403669724770644, "acc_stderr,none": 0.0184152863514164}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.01770453165325007}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.02970528405677244}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_stem": {"acc,none": 0.24548049476688868, "acc_stderr,none": 0.007663138180155176, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073464}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.03355045304882923}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.3276595744680851, "acc_stderr,none": 0.030683020843231004}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577657}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489614}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184408}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18981481481481483, "acc_stderr,none": 0.02674471483469191}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "sciq": {"alias": "sciq", "acc,none": 0.831, "acc_stderr,none": 0.011856625977890122, "acc_norm,none": 0.744, "acc_norm_stderr,none": 0.0138077751522342}}
{"created_at": "2025-08-15T18:27:00.570695", "global_step": 18000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.011552714477876666}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4075881298546106, "acc_stderr,none": 0.0049038158859832795, "acc_norm,none": 0.5262895837482573, "acc_norm_stderr,none": 0.004982879340691411}, "mmlu": {"acc,none": 0.25153112092294544, "acc_stderr,none": 0.0036552718529940893, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25313496280552605, "acc_stderr,none": 0.0063331449381263585, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.03512207412302054}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.0347769116216366}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.029331162294251728}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460302}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.043913262867240704}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04557239513497751}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.034624199316156234}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2745664739884393, "acc_stderr,none": 0.02402774515526502}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.025329888171900926}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.01092649610203495}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03188578017686398}, "mmlu_other": {"acc,none": 0.2729320888316704, "acc_stderr,none": 0.00796799182560412, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.027134291628741713}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3811659192825112, "acc_stderr,none": 0.03259625118416828}, "mmlu_management": {"alias": " - management", "acc,none": 0.3300970873786408, "acc_stderr,none": 0.04656147110012351}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2656449553001277, "acc_stderr,none": 0.015794302487888715}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888146}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.02624492034984301}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.025187786660227262}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.036293353299478595}, "mmlu_social_sciences": {"acc,none": 0.2349691257718557, "acc_stderr,none": 0.007642486856709401, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.04096985139843669}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.030313710538198896}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371393}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863804}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24220183486238533, "acc_stderr,none": 0.01836817630659862}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.03498149385462471}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320657}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644287}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546195}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409217}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.24421186171899778, "acc_stderr,none": 0.007642792475285517, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.03554180368025689}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.03396116205845335}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.042207736591714534}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", 
"acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534446}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727771}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525214}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617736}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.03257847384436777}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.02649191472735518}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "sciq": {"alias": "sciq", "acc,none": 0.835, "acc_stderr,none": 0.011743632866916166, "acc_norm,none": 0.743, "acc_norm_stderr,none": 0.013825416526895024}}
{"created_at": "2025-08-15T19:49:42.089698", "global_step": 20000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20802620802620803, "acc_stderr,none": 0.01162075957565238}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40908185620394344, "acc_stderr,none": 0.0049065958579167564, "acc_norm,none": 0.5300736904999004, "acc_norm_stderr,none": 0.004980747448813314}, "mmlu": {"acc,none": 0.24932345819683804, "acc_stderr,none": 0.003646698089873586, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2554729011689692, "acc_stderr,none": 0.006352374503639486, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790606}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.029331162294251728}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25316455696202533, "acc_stderr,none": 0.028304657943035303}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884123}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04668408033024931}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.024818350129436593}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24115755627009647, "acc_stderr,none": 0.024296594034763426}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.025407197798890162}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.010996156635142692}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.031581495393387324}, "mmlu_other": {"acc,none": 0.2581268104280657, "acc_stderr,none": 0.007840136115066515, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.027134291628741713}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.0309528902177499}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34977578475336324, "acc_stderr,none": 0.03200736719484504}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.02860595370200424}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24904214559386972, "acc_stderr,none": 0.015464676163395967}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294254}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.036643147772880864}, "mmlu_social_sciences": {"acc,none": 0.23756906077348067, "acc_stderr,none": 0.00766799581997994, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.04372748290278008}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386396}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.02925282329180363}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246787}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2689075630252101, "acc_stderr,none": 0.028801392193631276}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25137614678899084, "acc_stderr,none": 0.018599206360287415}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.19083969465648856, "acc_stderr,none": 0.03446513350752599}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167407}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579157}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.24294322867110688, "acc_stderr,none": 0.007629982901718654, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678318}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.040925639582376536}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.03097669299853445}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135303}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491842}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764815}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.23645320197044334, "acc_stderr,none": 0.029896114291733562}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.025928876132766118}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355174}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.040598672469526864}, "sciq": {"alias": "sciq", "acc,none": 0.838, "acc_stderr,none": 0.011657267771304419, "acc_norm,none": 0.754, "acc_norm_stderr,none": 0.013626065817750638}}
{"created_at": "2025-08-15T21:27:54.705347", "global_step": 22000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634107}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41037641904003186, "acc_stderr,none": 0.004908967278222491, "acc_norm,none": 0.5309699263095001, "acc_norm_stderr,none": 0.0049802004518516695}, "mmlu": {"acc,none": 0.25096140150975643, "acc_stderr,none": 0.003653848591805596, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25844845908607866, "acc_stderr,none": 0.00637721648686158, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139406}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598028}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.042664163633521664}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04557239513497752}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.033220157957767414}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.024818350129436596}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.27009646302250806, "acc_stderr,none": 0.025218040373410622}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.30864197530864196, "acc_stderr,none": 0.025702640260603753}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23989569752281617, "acc_stderr,none": 0.010906282617981655}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209194}, "mmlu_other": {"acc,none": 0.25716124879304797, "acc_stderr,none": 0.007828025213758973, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.02661648298050171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.0309528902177499}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34080717488789236, "acc_stderr,none": 0.03181149747055359}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928313}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24521072796934865, "acc_stderr,none": 0.015384352284543932}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.02392915551735129}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290406}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22426470588235295, "acc_stderr,none": 0.025336848563332372}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3373493975903614, "acc_stderr,none": 0.03680783690727581}, "mmlu_social_sciences": {"acc,none": 0.23854403639909003, "acc_stderr,none": 0.007683921716446521, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.04404556157374768}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.02912652283458682}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.030975436386845426}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.01822407811729909}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.03498149385462472}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.01747948700136476}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090496}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_stem": {"acc,none": 0.2457976530288614, "acc_stderr,none": 0.007648454362636882, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036844}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.042207736591714534}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.3659574468085106, "acc_stderr,none": 0.031489558297455304}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.02271746789770861}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.02489246917246283}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617732}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.03479185572599657}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.024837173518242394}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "sciq": {"alias": "sciq", "acc,none": 0.84, "acc_stderr,none": 0.011598902298689004, "acc_norm,none": 0.751, "acc_norm_stderr,none": 0.01368160027870231}}