Upload nemotron_actual_1T_exp/metrics.eval.jsonl with huggingface_hub
Browse files
nemotron_actual_1T_exp/metrics.eval.jsonl
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"created_at": "2025-09-04T13:01:11.563798", "global_step": 30000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19656019656019655, "acc_stderr,none": 0.011377439773963993}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42421828321051586, "acc_stderr,none": 0.004932137126625403, "acc_norm,none": 0.5595498904600678, "acc_norm_stderr,none": 0.004954265595373453}, "mmlu": {"acc,none": 0.23515168779376158, "acc_stderr,none": 0.003572599208696966, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2410201912858661, "acc_stderr,none": 0.006231264409995499, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482995}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.029571601065753378}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516301}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243838}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.031570650789119026}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.022698657167855713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18006430868167203, "acc_stderr,none": 0.02182342285774495}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967284}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.010926496102034954}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.251689732861281, "acc_stderr,none": 0.007768415154404844, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108625}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483098}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909281}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.03138147637575499}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.029996951858349483}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.015302380123542094}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242557}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537762}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288086}, "mmlu_social_sciences": {"acc,none": 0.2297692557686058, "acc_stderr,none": 0.007571330501289166, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481425}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1717171717171717, "acc_stderr,none": 0.026869716187429914}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.029778663037752954}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.021107730127244}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1981651376146789, "acc_stderr,none": 0.017090573804217888}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167418}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.02688214492230774}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3034825870646766, "acc_stderr,none": 0.03251006816458618}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_stem": {"acc,none": 0.21535045987947987, "acc_stderr,none": 0.0073133671357311705, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.032790004063100515}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.03216600808802269}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.1568627450980392, "acc_stderr,none": 0.03618664819936245}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.028346963777162452}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.02084229093011467}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2032258064516129, "acc_stderr,none": 0.022891687984554952}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444455}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655116}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567978}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "sciq": {"alias": "sciq", "acc,none": 0.891, "acc_stderr,none": 0.00985982840703718, "acc_norm,none": 0.859, "acc_norm_stderr,none": 0.011010914595992445}}
|
2 |
+
{"created_at": "2025-09-05T18:48:27.765051", "global_step": 60000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21294021294021295, "acc_stderr,none": 0.011720679449797593}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4389563831905995, "acc_stderr,none": 0.004952454721934775, "acc_norm,none": 0.5854411471818363, "acc_norm_stderr,none": 0.004916388962142329}, "mmlu": {"acc,none": 0.2545933627688364, "acc_stderr,none": 0.003671146936067525, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2463336875664187, "acc_stderr,none": 0.006285752413591295, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790608}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.03849856098794087}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2572347266881029, "acc_stderr,none": 0.024826171289250888}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2808641975308642, "acc_stderr,none": 0.025006469755799208}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25097783572359844, "acc_stderr,none": 0.011073730299187214}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.1871345029239766, "acc_stderr,none": 0.02991312723236804}, "mmlu_other": {"acc,none": 0.276472481493402, "acc_stderr,none": 0.007999114035375895, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.027134291628741713}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.37668161434977576, "acc_stderr,none": 0.03252113489929188}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27458492975734355, "acc_stderr,none": 0.015959829933084035}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.025058503316958154}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.02699219917306436}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3373493975903614, "acc_stderr,none": 0.0368078369072758}, "mmlu_social_sciences": {"acc,none": 0.2557686057848554, "acc_stderr,none": 0.007862296374849795, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893596}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.029126522834586825}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909902}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24102564102564103, "acc_stderr,none": 0.0216855466653332}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863807}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.01812566918086149}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.01759348689536683}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302506}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2897959183673469, "acc_stderr,none": 0.029043088683304335}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.29850746268656714, "acc_stderr,none": 0.032357437893550424}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.24421186171899778, "acc_stderr,none": 0.0076369880757132386, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.033550453048829254}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.034370793441061365}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774708}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179964}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.030783736757745643}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.025470196835900055}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.03090379695211447}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165085}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844072}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567977}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02541642838876747}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.899, "acc_stderr,none": 0.009533618929340988, "acc_norm,none": 0.866, "acc_norm_stderr,none": 0.01077776229836968}}
|
3 |
+
{"created_at": "2025-09-05T18:55:10.284570", "global_step": 90000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.011704202814200254}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.45508862776339376, "acc_stderr,none": 0.004969611554685393, "acc_norm,none": 0.5998805018920533, "acc_norm_stderr,none": 0.004889210628907982}, "mmlu": {"acc,none": 0.2647058823529412, "acc_stderr,none": 0.0037139088351715037, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27481402763018065, "acc_stderr,none": 0.006502064052511117, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.033954900208561116}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511783}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.03149328104507956}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2975206611570248, "acc_stderr,none": 0.04173349148083499}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.03623089915724146}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.024946792225272314}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.01431099954796147}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31189710610932475, "acc_stderr,none": 0.026311858071854155}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.32098765432098764, "acc_stderr,none": 0.025976566010862734}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.273142112125163, "acc_stderr,none": 0.011380150567830398}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.034678266857038266}, "mmlu_other": {"acc,none": 0.2526552944962987, "acc_stderr,none": 0.0077843068355961, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483099}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23318385650224216, "acc_stderr,none": 0.028380391147094716}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749486}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27330779054916987, "acc_stderr,none": 0.015936681062628556}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826517}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.022770868010113025}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.03410646614071857}, "mmlu_social_sciences": {"acc,none": 0.2612934676633084, "acc_stderr,none": 0.007869302622493945, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.030313710538198906}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909878}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.02110773012724399}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.02626502460827589}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.017765978652327572}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823965}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.018152871051538805}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.038313051408846034}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.37551020408163266, "acc_stderr,none": 0.031001209039894836}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3482587064676617, "acc_stderr,none": 0.03368787466115459}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.4, "acc_stderr,none": 0.04923659639173309}, "mmlu_stem": {"acc,none": 0.26482714874722485, "acc_stderr,none": 0.007849386997464108, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996794}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998904}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.027136349602424066}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.037245636197746325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525214}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462833}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.032257994762334846}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371216}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.037101857261199946}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.02813968944485967}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "sciq": {"alias": "sciq", "acc,none": 0.914, "acc_stderr,none": 0.008870325962594766, "acc_norm,none": 0.887, "acc_norm_stderr,none": 0.010016552866696853}}
|
4 |
+
{"created_at": "2025-09-05T18:58:34.494330", "global_step": 120000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.01158688187917783}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4642501493726349, "acc_stderr,none": 0.004977010670436546, "acc_norm,none": 0.6177056363274248, "acc_norm_stderr,none": 0.004849547819134488}, "mmlu": {"acc,none": 0.25580401652186296, "acc_stderr,none": 0.0036762235110630054, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24484590860786398, "acc_stderr,none": 0.006268603442149771, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.042163702135578345}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.032568666616811015}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.03114557065948678}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.02765215314415928}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.03520893951097653}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252626}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742177}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.21676300578034682, "acc_stderr,none": 0.02218347766841285}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2379421221864952, "acc_stderr,none": 0.024185150647818697}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2716049382716049, "acc_stderr,none": 0.024748624490537375}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.01106415102716543}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209194}, "mmlu_other": {"acc,none": 0.26874798841326036, "acc_stderr,none": 0.007954707183851124, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.026480357179895674}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889904}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.030351527323344948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2554278416347382, "acc_stderr,none": 0.015594955384455765}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826514}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460994}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22794117647058823, "acc_stderr,none": 0.025483081468029804}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3072289156626506, "acc_stderr,none": 0.03591566797824663}, "mmlu_social_sciences": {"acc,none": 0.2729931751706207, "acc_stderr,none": 0.008010572437039662, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365897}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565317}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.25384615384615383, "acc_stderr,none": 0.022066054378726253}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.28991596638655465, "acc_stderr,none": 0.029472485833136094}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22568807339449543, "acc_stderr,none": 0.017923087667803043}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677697}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.01781267654232066}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.363265306122449, "acc_stderr,none": 0.030789051139030806}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31840796019900497, "acc_stderr,none": 0.03294118479054095}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_stem": {"acc,none": 0.24262607040913417, "acc_stderr,none": 0.007615036005667959, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.03885004245800254}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03317672787533157}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080342}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993176}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.029644006577009618}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.037800192304380156}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.020006075494524406}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462857}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.030108330718011625}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.17407407407407408, "acc_stderr,none": 0.02311859603355185}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.17880794701986755, "acc_stderr,none": 0.031287448506007225}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.030058202704309846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.909, "acc_stderr,none": 0.009099549538400243, "acc_norm,none": 0.871, "acc_norm_stderr,none": 0.010605256784796575}}
|
5 |
+
{"created_at": "2025-09-05T19:08:56.148992", "global_step": 150000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18755118755118755, "acc_stderr,none": 0.011175783964114732}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46932881896036643, "acc_stderr,none": 0.00498038457553539, "acc_norm,none": 0.6241784505078669, "acc_norm_stderr,none": 0.00483344455633863}, "mmlu": {"acc,none": 0.24996439253667568, "acc_stderr,none": 0.0036489802109832417, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2450584484590861, "acc_stderr,none": 0.006272654613710996, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102147}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.032568666616811015}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.027652153144159256}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591207}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961447}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2765273311897106, "acc_stderr,none": 0.02540383297817962}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.29012345679012347, "acc_stderr,none": 0.025251173936495026}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23728813559322035, "acc_stderr,none": 0.010865436690780255}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.27132281943997427, "acc_stderr,none": 0.007947764848536928, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.027134291628741702}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.030299574664788147}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.39461883408071746, "acc_stderr,none": 0.03280400504755291}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.04301250399690877}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27330779054916987, "acc_stderr,none": 0.01593668106262856}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.024630048979824768}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.02657786094330785}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.30120481927710846, "acc_stderr,none": 0.035716092300534796}, "mmlu_social_sciences": {"acc,none": 0.23951901202469938, "acc_stderr,none": 0.007689735570101815, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.029620227874790458}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23846153846153847, "acc_stderr,none": 0.02160629449464773}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.02702543349888238}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.018272575810231867}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612379002}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302505}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.0250002560395462}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_stem": {"acc,none": 0.24643196955280686, "acc_stderr,none": 0.007669700344581446, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.038201699145179055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310049}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.15, "acc_stderr,none": 0.0358870281282637}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.04389869956808778}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.02937917046412483}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577658}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.025091892378859275}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.03161856335358609}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.02564410863926764}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02541642838876748}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "sciq": {"alias": "sciq", "acc,none": 0.92, "acc_stderr,none": 0.008583336977753651, "acc_norm,none": 0.894, "acc_norm_stderr,none": 0.009739551265785138}}
|
6 |
+
{"created_at": "2025-09-06T15:21:44.665848", "global_step": 180000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883527}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47769368651663013, "acc_stderr,none": 0.004984813391016207, "acc_norm,none": 0.6415056761601274, "acc_norm_stderr,none": 0.004785781979354874}, "mmlu": {"acc,none": 0.2766699900299103, "acc_stderr,none": 0.0037654350962153644, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27523910733262485, "acc_stderr,none": 0.006503331618427093, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924318}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923393}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25316455696202533, "acc_stderr,none": 0.028304657943035286}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.041032038305145124}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.03623089915724147}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.33236994219653176, "acc_stderr,none": 0.02536116874968822}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3086816720257235, "acc_stderr,none": 0.02623696588115327}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.02622964917882116}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25945241199478486, "acc_stderr,none": 0.011195262076350316}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.03631053496488904}, "mmlu_other": {"acc,none": 0.271000965561635, "acc_stderr,none": 0.007957352312101065, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.026341480371118352}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.03345036916788991}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.22869955156950672, "acc_stderr,none": 0.028188240046929193}, "mmlu_management": {"alias": " - management", "acc,none": 0.13592233009708737, "acc_stderr,none": 0.03393295729761012}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.029343114798094472}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, "acc_stderr,none": 0.016117318166832272}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.32679738562091504, "acc_stderr,none": 0.02685729466328142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.27205882352941174, "acc_stderr,none": 0.027033041151681456}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.03550920185689629}, "mmlu_social_sciences": {"acc,none": 0.27786805329866754, "acc_stderr,none": 0.008022201151749085, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583638}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.03182155050916646}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.27692307692307694, "acc_stderr,none": 0.022688042352424994}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.028205545033277726}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22201834862385322, "acc_stderr,none": 0.017818849564796617}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3816793893129771, "acc_stderr,none": 0.04260735157644561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594726}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.036942843353377997}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.40408163265306124, "acc_stderr,none": 0.03141470802586589}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.34328358208955223, "acc_stderr,none": 0.03357379665433431}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_stem": {"acc,none": 0.28322232794164287, "acc_stderr,none": 0.008016595929491624, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34814814814814815, "acc_stderr,none": 0.041153246103369526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34868421052631576, "acc_stderr,none": 0.0387813988879761}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496217}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3448275862068966, "acc_stderr,none": 0.03960933549451208}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633356}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239952}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0317852971064275}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.02708037281514566}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3101851851851852, "acc_stderr,none": 0.031546962856566295}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "sciq": {"alias": "sciq", "acc,none": 0.938, "acc_stderr,none": 0.007629823996280312, "acc_norm,none": 0.913, "acc_norm_stderr,none": 0.008916866630745894}}
|
7 |
+
{"created_at": "2025-09-06T15:30:59.513312", "global_step": 210000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.49193387771360286, "acc_stderr,none": 0.004989132075598773, "acc_norm,none": 0.6537542322246565, "acc_norm_stderr,none": 0.004748003276466224}, "mmlu": {"acc,none": 0.2795185870958553, "acc_stderr,none": 0.0037699224398551117, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.273538788522848, "acc_stderr,none": 0.006487022399304413, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276862}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3151515151515151, "acc_stderr,none": 0.0362773057502241}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145628}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3140495867768595, "acc_stderr,none": 0.04236964753041017}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.034624199316156234}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.025131000233647897}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31189710610932475, "acc_stderr,none": 0.026311858071854155}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.35185185185185186, "acc_stderr,none": 0.026571483480719978}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2666232073011734, "acc_stderr,none": 0.011293836031612138}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.0340105262010409}, "mmlu_other": {"acc,none": 0.26906984229159964, "acc_stderr,none": 0.007916523394738913, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.026616482980501715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047874}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.24663677130044842, "acc_stderr,none": 0.028930413120910874}, "mmlu_management": {"alias": " - management", "acc,none": 0.13592233009708737, "acc_stderr,none": 0.03393295729761013}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.029996951858349476}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.29118773946360155, "acc_stderr,none": 0.0162460870697014}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.32679738562091504, "acc_stderr,none": 0.02685729466328142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15808823529411764, "acc_stderr,none": 0.02216146260806852}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.03629335329947859}, "mmlu_social_sciences": {"acc,none": 0.2885927851803705, "acc_stderr,none": 0.008115408293027706, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.0303137105381989}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24870466321243523, "acc_stderr,none": 0.0311958408777003}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.022489389793654824}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.02772206549336126}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24403669724770644, "acc_stderr,none": 0.01841528635141641}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.4122137404580153, "acc_stderr,none": 0.043171711948702556}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.0184334276494019}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.040693063197213775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4163265306122449, "acc_stderr,none": 0.03155782816556165}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3383084577114428, "acc_stderr,none": 0.033455630703391935}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001974}, "mmlu_stem": {"acc,none": 0.28988265144307007, "acc_stderr,none": 0.00803778896743868, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542126}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.362962962962963, "acc_stderr,none": 0.04153948404742398}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3618421052631579, "acc_stderr,none": 0.03910525752849725}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.15, "acc_stderr,none": 0.035887028128263734}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.046550104113196177}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411018}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496224}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3793103448275862, "acc_stderr,none": 0.04043461861916747}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.0236369759961018}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.02518900666021238}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411018}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844072}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.03822746937658753}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 0.02723229846269023}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3392857142857143, "acc_stderr,none": 0.04493949068613539}, "sciq": {"alias": "sciq", "acc,none": 0.929, "acc_stderr,none": 0.00812557844248792, "acc_norm,none": 0.893, "acc_norm_stderr,none": 0.009779910359847169}}
|
8 |
+
{"created_at": "2025-09-06T16:31:44.582701", "global_step": 240000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.011704202814200272}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.504282015534754, "acc_stderr,none": 0.004989598426249549, "acc_norm,none": 0.676458872734515, "acc_norm_stderr,none": 0.004668710689192433}, "mmlu": {"acc,none": 0.2807292408488819, "acc_stderr,none": 0.003779176865989652, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25738575982996814, "acc_stderr,none": 0.006366752380338899, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3253968253968254, "acc_stderr,none": 0.041905964388711366}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.036085410115739666}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.03166009679399813}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955927}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.0401910747255735}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3092485549132948, "acc_stderr,none": 0.02488314057007175}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2861736334405145, "acc_stderr,none": 0.02567025924218894}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005716}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2542372881355932, "acc_stderr,none": 0.011121129007840673}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.18128654970760233, "acc_stderr,none": 0.029547741687640027}, "mmlu_other": {"acc,none": 0.28644995172191823, "acc_stderr,none": 0.008097953339504683, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421255}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.028152837942493868}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.03583901754736412}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.22869955156950672, "acc_stderr,none": 0.02818824004692919}, "mmlu_management": {"alias": " - management", "acc,none": 0.34951456310679613, "acc_stderr,none": 0.04721188506097173}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.028605953702004253}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.015889888362560486}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.02678745311190654}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729906}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3014705882352941, "acc_stderr,none": 0.027875982114273168}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3674698795180723, "acc_stderr,none": 0.03753267402120574}, "mmlu_social_sciences": {"acc,none": 0.3132921676958076, "acc_stderr,none": 0.00829717885257246, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.033586181457325226}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.35233160621761656, "acc_stderr,none": 0.03447478286414357}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.29743589743589743, "acc_stderr,none": 0.02317740813146594}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.03017680828897434}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3357798165137615, "acc_stderr,none": 0.020248081396752937}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3816793893129771, "acc_stderr,none": 0.04260735157644561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.20915032679738563, "acc_stderr,none": 0.016453399332279323}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.43673469387755104, "acc_stderr,none": 0.031751952375833226}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3283582089552239, "acc_stderr,none": 0.033206858897443244}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.4, "acc_stderr,none": 0.04923659639173309}, "mmlu_stem": {"acc,none": 0.2781477957500793, "acc_stderr,none": 0.007985877689329995, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.03885004245800255}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.03827052357950756}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929776}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3021276595744681, "acc_stderr,none": 0.030017554471880554}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708624}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073835}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03114144782353604}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "sciq": {"alias": "sciq", "acc,none": 0.948, "acc_stderr,none": 0.007024624213817138, "acc_norm,none": 0.913, "acc_norm_stderr,none": 0.008916866630745871}}
|
9 |
+
{"created_at": "2025-09-07T08:33:52.969504", "global_step": 270000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5177255526787492, "acc_stderr,none": 0.0049866448947431235, "acc_norm,none": 0.6919936267675761, "acc_norm_stderr,none": 0.004607256752931888}, "mmlu": {"acc,none": 0.28863409770687937, "acc_stderr,none": 0.0038114931750842566, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2884165781083953, "acc_stderr,none": 0.006593423091777149, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.31746031746031744, "acc_stderr,none": 0.04163453031302859}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511783}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.03077855467869326}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598042}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.044811377559424694}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243839}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.33236994219653176, "acc_stderr,none": 0.025361168749688218}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103986}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31511254019292606, "acc_stderr,none": 0.026385273703464496}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.31790123456790126, "acc_stderr,none": 0.025910063528240875}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2920469361147327, "acc_stderr,none": 0.01161334913627182}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.0356507967070831}, "mmlu_other": {"acc,none": 0.28355326681686516, "acc_stderr,none": 0.008069823074337325, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.28679245283018867, "acc_stderr,none": 0.02783491252754406}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.03345036916788991}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2600896860986547, "acc_stderr,none": 0.029442495585857487}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.041858325989283164}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.029343114798094472}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2988505747126437, "acc_stderr,none": 0.01636925681509312}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3431372549019608, "acc_stderr,none": 0.02718449890994162}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.027187127011503803}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3373493975903614, "acc_stderr,none": 0.03680783690727581}, "mmlu_social_sciences": {"acc,none": 0.2853428664283393, "acc_stderr,none": 0.008085362101410445, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.17543859649122806, "acc_stderr,none": 0.035779548139483704}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.031911782267135466}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.03051611137147601}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.25384615384615383, "acc_stderr,none": 0.022066054378726257}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.027886828078380554}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25504587155963304, "acc_stderr,none": 0.018688500856535825}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.37404580152671757, "acc_stderr,none": 0.04243869242230524}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.018152871051538812}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.40816326530612246, "acc_stderr,none": 0.03146465712827423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3383084577114428, "acc_stderr,none": 0.03345563070339194}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_stem": {"acc,none": 0.29717729146844274, "acc_stderr,none": 0.008114219614411873, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34814814814814815, "acc_stderr,none": 0.041153246103369526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536975}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993178}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.029101290698386705}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.36551724137931035, "acc_stderr,none": 0.04013124195424386}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.29354838709677417, "acc_stderr,none": 0.0259060870213193}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.31527093596059114, "acc_stderr,none": 0.03269080871970186}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110175}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085626}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.36423841059602646, "acc_stderr,none": 0.03929111781242741}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993666}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3392857142857143, "acc_stderr,none": 0.04493949068613539}, "sciq": {"alias": "sciq", "acc,none": 0.949, "acc_stderr,none": 0.006960420062571415, "acc_norm,none": 0.907, "acc_norm_stderr,none": 0.009188875634996657}}
|
10 |
+
{"created_at": "2025-09-07T08:42:30.139116", "global_step": 300000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22604422604422605, "acc_stderr,none": 0.011974981909575612}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5216092411870146, "acc_stderr,none": 0.004985119183640761, "acc_norm,none": 0.6958773152758415, "acc_norm_stderr,none": 0.00459094683972717}, "mmlu": {"acc,none": 0.3098561458481698, "acc_stderr,none": 0.0038878012326674276, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.30456960680127526, "acc_stderr,none": 0.0066926980733252464, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04285714285714281}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3696969696969697, "acc_stderr,none": 0.03769430314512568}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31645569620253167, "acc_stderr,none": 0.030274974880218967}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.044811377559424694}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3374233128834356, "acc_stderr,none": 0.03714908409935574}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3468208092485549, "acc_stderr,none": 0.02562472399403046}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.33440514469453375, "acc_stderr,none": 0.026795422327893944}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.02622964917882116}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30182529335071706, "acc_stderr,none": 0.01172435051810589}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.034886477134579215}, "mmlu_other": {"acc,none": 0.31670421628580625, "acc_stderr,none": 0.008326858366900305, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3471698113207547, "acc_stderr,none": 0.029300101705549655}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3352601156069364, "acc_stderr,none": 0.03599586301247077}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229143}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.029996951858349486}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.34227330779054915, "acc_stderr,none": 0.01696703176641363}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.027475969910660952}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460994}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21691176470588236, "acc_stderr,none": 0.02503584522771126}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.03740059382029319}, "mmlu_social_sciences": {"acc,none": 0.32044198895027626, "acc_stderr,none": 0.008353396040564934, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481404}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.031618779179354094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2794871794871795, "acc_stderr,none": 0.022752388839776823}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2605042016806723, "acc_stderr,none": 0.028510251512341926}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30642201834862387, "acc_stderr,none": 0.01976551722045852}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.4122137404580153, "acc_stderr,none": 0.04317171194870254}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3006535947712418, "acc_stderr,none": 0.018550634502952964}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.0449429086625209}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4326530612244898, "acc_stderr,none": 0.031717528240626645}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.373134328358209, "acc_stderr,none": 0.034198326081760065}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.52, "acc_stderr,none": 0.05021167315686781}, "mmlu_stem": {"acc,none": 0.3006660323501427, "acc_stderr,none": 0.008140298813164146, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04171654161354543}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34210526315789475, "acc_stderr,none": 0.03860731599316092}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.03873958714149352}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.029644006577009618}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.36551724137931035, "acc_stderr,none": 0.04013124195424386}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2751322751322751, "acc_stderr,none": 0.023000086859068642}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.026069362295335137}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.31527093596059114, "acc_stderr,none": 0.03269080871970186}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117317}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2851851851851852, "acc_stderr,none": 0.027528599210340492}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.038615575462551684}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.029157522184605596}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.04521829902833587}, "sciq": {"alias": "sciq", "acc,none": 0.952, "acc_stderr,none": 0.0067632641336666625, "acc_norm,none": 0.92, "acc_norm_stderr,none": 0.008583336977753653}}
|