{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9965010496850945, "eval_steps": 100, "global_step": 89, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 719.0248603820801, "epoch": 0.01119664100769769, "grad_norm": 5.308396339416504, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7022594679147005, "reward_std": 0.11167733068577945, "rewards/accuracy_reward": 0.4343112222850323, "rewards/semantic_entropy_math_reward": 0.7022594679147005, "rewards/total_entropy_reward": 1.2448364309966564, "step": 1 }, { "completion_length": 697.6454010009766, "epoch": 0.02239328201539538, "grad_norm": 3.310378313064575, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6712827906012535, "reward_std": 0.1011071486864239, "rewards/accuracy_reward": 0.352040808647871, "rewards/semantic_entropy_math_reward": 0.6712827868759632, "rewards/total_entropy_reward": 1.305861696600914, "step": 2 }, { "completion_length": 701.3826370239258, "epoch": 0.03358992302309307, "grad_norm": 4.717713356018066, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7075437195599079, "reward_std": 0.07735519809648395, "rewards/accuracy_reward": 0.4126275437884033, "rewards/semantic_entropy_math_reward": 0.7075437270104885, "rewards/total_entropy_reward": 1.2551036067306995, "step": 3 }, { "completion_length": 692.5803489685059, "epoch": 0.04478656403079076, "grad_norm": 4.699649810791016, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6926020309329033, "reward_std": 0.09249642631039023, "rewards/accuracy_reward": 0.4196428433060646, "rewards/semantic_entropy_math_reward": 0.6926020495593548, "rewards/total_entropy_reward": 1.2794943004846573, "step": 4 }, { "completion_length": 643.8430976867676, "epoch": 0.05598320503848846, "grad_norm": 7.118838310241699, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7232142686843872, "reward_std": 0.09167159674689174, "rewards/accuracy_reward": 0.41326529532670975, "rewards/semantic_entropy_math_reward": 0.7232142575085163, "rewards/total_entropy_reward": 1.2130939476191998, "step": 5 }, { "completion_length": 711.6734580993652, "epoch": 0.06717984604618614, "grad_norm": 4.829713344573975, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6625364199280739, "reward_std": 0.1073409270029515, "rewards/accuracy_reward": 0.37436223588883877, "rewards/semantic_entropy_math_reward": 0.6625364273786545, "rewards/total_entropy_reward": 1.320713147521019, "step": 6 }, { "completion_length": 676.0541915893555, "epoch": 0.07837648705388384, "grad_norm": 4.374125003814697, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7006195187568665, "reward_std": 0.09755392652004957, "rewards/accuracy_reward": 0.4247448882088065, "rewards/semantic_entropy_math_reward": 0.7006195187568665, "rewards/total_entropy_reward": 1.2528423443436623, "step": 7 }, { "completion_length": 712.5401573181152, "epoch": 0.08957312806158152, "grad_norm": 6.558979034423828, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.633381923660636, "reward_std": 0.10544528882019222, "rewards/accuracy_reward": 0.3463010173290968, "rewards/semantic_entropy_math_reward": 0.6333819553256035, "rewards/total_entropy_reward": 1.3834920637309551, "step": 8 }, { "completion_length": 711.6345539093018, "epoch": 0.10076976906927922, "grad_norm": 7.621776103973389, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6800291426479816, "reward_std": 0.10248321201652288, "rewards/accuracy_reward": 0.39604590833187103, "rewards/semantic_entropy_math_reward": 0.6800291538238525, "rewards/total_entropy_reward": 1.2968212738633156, "step": 9 }, { "completion_length": 674.1779174804688, "epoch": 0.11196641007697691, "grad_norm": 7.523331165313721, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6800291538238525, "reward_std": 0.11024063220247626, "rewards/accuracy_reward": 0.411352033726871, "rewards/semantic_entropy_math_reward": 0.6800291500985622, "rewards/total_entropy_reward": 1.2904038280248642, "step": 10 }, { "completion_length": 741.4017696380615, "epoch": 0.1231630510846746, "grad_norm": 6.336475372314453, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6415816266089678, "reward_std": 0.10356245189905167, "rewards/accuracy_reward": 0.3858418306335807, "rewards/semantic_entropy_math_reward": 0.6415816303342581, "rewards/total_entropy_reward": 1.363376997411251, "step": 11 }, { "completion_length": 692.8437347412109, "epoch": 0.13435969209237228, "grad_norm": 7.185462474822998, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7002550885081291, "reward_std": 0.10777880600653589, "rewards/accuracy_reward": 0.43048468697816133, "rewards/semantic_entropy_math_reward": 0.7002551108598709, "rewards/total_entropy_reward": 1.2493381686508656, "step": 12 }, { "completion_length": 704.6638870239258, "epoch": 0.14555633310007, "grad_norm": 17.740726470947266, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.673833817243576, "reward_std": 0.09210015833377838, "rewards/accuracy_reward": 0.43558672815561295, "rewards/semantic_entropy_math_reward": 0.6738338358700275, "rewards/total_entropy_reward": 1.3185552880167961, "step": 13 }, { "completion_length": 679.9374847412109, "epoch": 0.15675297410776767, "grad_norm": 9.5872802734375, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7075437121093273, "reward_std": 0.09575178450904787, "rewards/accuracy_reward": 0.40624999068677425, "rewards/semantic_entropy_math_reward": 0.7075437419116497, "rewards/total_entropy_reward": 1.237540539354086, "step": 14 }, { "completion_length": 731.2404136657715, "epoch": 0.16794961511546536, "grad_norm": 4.687215805053711, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6375728864222765, "reward_std": 0.108485147356987, "rewards/accuracy_reward": 0.3762755021452904, "rewards/semantic_entropy_math_reward": 0.6375729013234377, "rewards/total_entropy_reward": 1.3838917911052704, "step": 15 }, { "completion_length": 700.0522842407227, "epoch": 0.17914625612316304, "grad_norm": 8.418819427490234, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6547011584043503, "reward_std": 0.10533164534717798, "rewards/accuracy_reward": 0.40624999161809683, "rewards/semantic_entropy_math_reward": 0.6547011733055115, "rewards/total_entropy_reward": 1.3433180004358292, "step": 16 }, { "completion_length": 684.034423828125, "epoch": 0.19034289713086075, "grad_norm": 7.032946586608887, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.664905235171318, "reward_std": 0.0974241562653333, "rewards/accuracy_reward": 0.3832908095791936, "rewards/semantic_entropy_math_reward": 0.6649052500724792, "rewards/total_entropy_reward": 1.3396263718605042, "step": 17 }, { "completion_length": 677.9177227020264, "epoch": 0.20153953813855843, "grad_norm": 15.37868595123291, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6975218504667282, "reward_std": 0.09150373586453497, "rewards/accuracy_reward": 0.45153060369193554, "rewards/semantic_entropy_math_reward": 0.69752187281847, "rewards/total_entropy_reward": 1.262941613793373, "step": 18 }, { "completion_length": 683.5771484375, "epoch": 0.21273617914625612, "grad_norm": 36.41266632080078, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7057215645909309, "reward_std": 0.09910683194175363, "rewards/accuracy_reward": 0.4368622377514839, "rewards/semantic_entropy_math_reward": 0.7057215794920921, "rewards/total_entropy_reward": 1.2459207847714424, "step": 19 }, { "completion_length": 688.8073768615723, "epoch": 0.22393282015395383, "grad_norm": 27.405065536499023, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7097303122282028, "reward_std": 0.09203298413194716, "rewards/accuracy_reward": 0.4598214225843549, "rewards/semantic_entropy_math_reward": 0.7097303308546543, "rewards/total_entropy_reward": 1.2417229264974594, "step": 20 }, { "completion_length": 736.3092956542969, "epoch": 0.2351294611616515, "grad_norm": 19.091941833496094, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6541544962674379, "reward_std": 0.0986680502537638, "rewards/accuracy_reward": 0.4228316266089678, "rewards/semantic_entropy_math_reward": 0.6541545186191797, "rewards/total_entropy_reward": 1.3527532257139683, "step": 21 }, { "completion_length": 709.4795722961426, "epoch": 0.2463261021693492, "grad_norm": 14.640216827392578, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6836734637618065, "reward_std": 0.10867427196353674, "rewards/accuracy_reward": 0.46301019564270973, "rewards/semantic_entropy_math_reward": 0.6836734749376774, "rewards/total_entropy_reward": 1.2770788073539734, "step": 22 }, { "completion_length": 681.6218032836914, "epoch": 0.2575227431770469, "grad_norm": 13.511483192443848, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6967929936945438, "reward_std": 0.09672949556261301, "rewards/accuracy_reward": 0.44579080305993557, "rewards/semantic_entropy_math_reward": 0.6967929899692535, "rewards/total_entropy_reward": 1.2772413976490498, "step": 23 }, { "completion_length": 705.6811141967773, "epoch": 0.26871938418474456, "grad_norm": 15.547083854675293, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6461370252072811, "reward_std": 0.1031385101377964, "rewards/accuracy_reward": 0.3985969265922904, "rewards/semantic_entropy_math_reward": 0.6461370065808296, "rewards/total_entropy_reward": 1.3622385039925575, "step": 24 }, { "completion_length": 731.4566116333008, "epoch": 0.27991602519244224, "grad_norm": 8.434446334838867, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6233600564301014, "reward_std": 0.08827707398450002, "rewards/accuracy_reward": 0.3494897885248065, "rewards/semantic_entropy_math_reward": 0.6233600713312626, "rewards/total_entropy_reward": 1.4261119738221169, "step": 25 }, { "completion_length": 685.5197486877441, "epoch": 0.29111266620014, "grad_norm": 12.586360931396484, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6765670366585255, "reward_std": 0.1049469755962491, "rewards/accuracy_reward": 0.412627543322742, "rewards/semantic_entropy_math_reward": 0.6765670329332352, "rewards/total_entropy_reward": 1.3074346259236336, "step": 26 }, { "completion_length": 711.061840057373, "epoch": 0.30230930720783766, "grad_norm": 9.39128303527832, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6521501429378986, "reward_std": 0.1099298931658268, "rewards/accuracy_reward": 0.40624999068677425, "rewards/semantic_entropy_math_reward": 0.6521501652896404, "rewards/total_entropy_reward": 1.3500806987285614, "step": 27 }, { "completion_length": 707.5612106323242, "epoch": 0.31350594821553535, "grad_norm": 8.279614448547363, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6692784316837788, "reward_std": 0.10041180578991771, "rewards/accuracy_reward": 0.36096938233822584, "rewards/semantic_entropy_math_reward": 0.6692784205079079, "rewards/total_entropy_reward": 1.3163355849683285, "step": 28 }, { "completion_length": 675.4157962799072, "epoch": 0.32470258922323303, "grad_norm": 13.983796119689941, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7091836705803871, "reward_std": 0.09643913782201707, "rewards/accuracy_reward": 0.478316318243742, "rewards/semantic_entropy_math_reward": 0.7091836743056774, "rewards/total_entropy_reward": 1.2364819720387459, "step": 29 }, { "completion_length": 723.9189872741699, "epoch": 0.3358992302309307, "grad_norm": 13.94961166381836, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.614431481808424, "reward_std": 0.12222768180072308, "rewards/accuracy_reward": 0.3533163210377097, "rewards/semantic_entropy_math_reward": 0.614431481808424, "rewards/total_entropy_reward": 1.4096959978342056, "step": 30 }, { "completion_length": 721.4604339599609, "epoch": 0.3470958712386284, "grad_norm": 7.932522773742676, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6603498533368111, "reward_std": 0.09537219302728772, "rewards/accuracy_reward": 0.3992346851155162, "rewards/semantic_entropy_math_reward": 0.6603498607873917, "rewards/total_entropy_reward": 1.3325160779058933, "step": 31 }, { "completion_length": 702.3788185119629, "epoch": 0.3582925122463261, "grad_norm": 9.096883773803711, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6829446069896221, "reward_std": 0.09760210802778602, "rewards/accuracy_reward": 0.40816325694322586, "rewards/semantic_entropy_math_reward": 0.682944618165493, "rewards/total_entropy_reward": 1.303523451089859, "step": 32 }, { "completion_length": 700.5924644470215, "epoch": 0.3694891532540238, "grad_norm": 3.891953229904175, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7055393569171429, "reward_std": 0.10629667551256716, "rewards/accuracy_reward": 0.45982141979038715, "rewards/semantic_entropy_math_reward": 0.7055393569171429, "rewards/total_entropy_reward": 1.2391385585069656, "step": 33 }, { "completion_length": 716.7027950286865, "epoch": 0.3806857942617215, "grad_norm": 12.71116828918457, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6698250789195299, "reward_std": 0.09906701685395092, "rewards/accuracy_reward": 0.40369897056370974, "rewards/semantic_entropy_math_reward": 0.6698250807821751, "rewards/total_entropy_reward": 1.3271235637366772, "step": 34 }, { "completion_length": 769.0414352416992, "epoch": 0.3918824352694192, "grad_norm": 9.855749130249023, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6481414064764977, "reward_std": 0.10145364608615637, "rewards/accuracy_reward": 0.3743622349575162, "rewards/semantic_entropy_math_reward": 0.6481414288282394, "rewards/total_entropy_reward": 1.3565044924616814, "step": 35 }, { "completion_length": 722.349479675293, "epoch": 0.40307907627711687, "grad_norm": 5.131447792053223, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6638119556009769, "reward_std": 0.09085186710581183, "rewards/accuracy_reward": 0.4170918297022581, "rewards/semantic_entropy_math_reward": 0.6638119481503963, "rewards/total_entropy_reward": 1.350273534655571, "step": 36 }, { "completion_length": 703.8832778930664, "epoch": 0.41427571728481455, "grad_norm": 7.057519435882568, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.690597664564848, "reward_std": 0.09433076065033674, "rewards/accuracy_reward": 0.43686223309487104, "rewards/semantic_entropy_math_reward": 0.6905976496636868, "rewards/total_entropy_reward": 1.2801040560007095, "step": 37 }, { "completion_length": 725.2799606323242, "epoch": 0.42547235829251223, "grad_norm": 7.460778713226318, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6669096164405346, "reward_std": 0.10552376857958734, "rewards/accuracy_reward": 0.41326529532670975, "rewards/semantic_entropy_math_reward": 0.6669096313416958, "rewards/total_entropy_reward": 1.3323625773191452, "step": 38 }, { "completion_length": 725.7671966552734, "epoch": 0.4366689993002099, "grad_norm": 7.124528408050537, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6472303047776222, "reward_std": 0.10429512546397746, "rewards/accuracy_reward": 0.3909438718110323, "rewards/semantic_entropy_math_reward": 0.6472303159534931, "rewards/total_entropy_reward": 1.3546394035220146, "step": 39 }, { "completion_length": 697.7423439025879, "epoch": 0.44786564030790765, "grad_norm": 7.065954208374023, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7193877585232258, "reward_std": 0.09428266366012394, "rewards/accuracy_reward": 0.4336734600365162, "rewards/semantic_entropy_math_reward": 0.7193877547979355, "rewards/total_entropy_reward": 1.2143822945654392, "step": 40 }, { "completion_length": 703.3673400878906, "epoch": 0.45906228131560534, "grad_norm": 6.764792442321777, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.706632649526, "reward_std": 0.0942279752343893, "rewards/accuracy_reward": 0.4649234591051936, "rewards/semantic_entropy_math_reward": 0.7066326681524515, "rewards/total_entropy_reward": 1.238373503088951, "step": 41 }, { "completion_length": 651.4183578491211, "epoch": 0.470258922323303, "grad_norm": 16.47541618347168, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6813046522438526, "reward_std": 0.10622056131251156, "rewards/accuracy_reward": 0.43239795230329037, "rewards/semantic_entropy_math_reward": 0.681304682046175, "rewards/total_entropy_reward": 1.284136950969696, "step": 42 }, { "completion_length": 743.2646636962891, "epoch": 0.4814555633310007, "grad_norm": 15.174323081970215, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6525145824998617, "reward_std": 0.10325121926143765, "rewards/accuracy_reward": 0.3947704015299678, "rewards/semantic_entropy_math_reward": 0.652514586225152, "rewards/total_entropy_reward": 1.358573641628027, "step": 43 }, { "completion_length": 725.5962905883789, "epoch": 0.4926522043386984, "grad_norm": 7.421966075897217, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6867711264640093, "reward_std": 0.09842351987026632, "rewards/accuracy_reward": 0.4323979504406452, "rewards/semantic_entropy_math_reward": 0.6867711190134287, "rewards/total_entropy_reward": 1.2839920222759247, "step": 44 }, { "completion_length": 698.1390113830566, "epoch": 0.5038488453463961, "grad_norm": 9.836834907531738, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6851311810314655, "reward_std": 0.10096711455844343, "rewards/accuracy_reward": 0.44196427427232265, "rewards/semantic_entropy_math_reward": 0.6851312182843685, "rewards/total_entropy_reward": 1.290800966322422, "step": 45 }, { "completion_length": 737.8379898071289, "epoch": 0.5150454863540938, "grad_norm": 13.060182571411133, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6867711432278156, "reward_std": 0.10890168650075793, "rewards/accuracy_reward": 0.403061218559742, "rewards/semantic_entropy_math_reward": 0.6867711097002029, "rewards/total_entropy_reward": 1.2737670093774796, "step": 46 }, { "completion_length": 720.0937309265137, "epoch": 0.5262421273617914, "grad_norm": 17.424467086791992, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6268221419304609, "reward_std": 0.11281977966427803, "rewards/accuracy_reward": 0.39668366219848394, "rewards/semantic_entropy_math_reward": 0.6268221419304609, "rewards/total_entropy_reward": 1.396425575017929, "step": 47 }, { "completion_length": 730.9495983123779, "epoch": 0.5374387683694891, "grad_norm": 7.613661766052246, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6333818975836039, "reward_std": 0.09289166564121842, "rewards/accuracy_reward": 0.38201529905200005, "rewards/semantic_entropy_math_reward": 0.6333819199353456, "rewards/total_entropy_reward": 1.3891723416745663, "step": 48 }, { "completion_length": 730.4712867736816, "epoch": 0.5486354093771868, "grad_norm": 11.16511344909668, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.628097664564848, "reward_std": 0.10562112857587636, "rewards/accuracy_reward": 0.3871173355728388, "rewards/semantic_entropy_math_reward": 0.6280976608395576, "rewards/total_entropy_reward": 1.3984056264162064, "step": 49 }, { "completion_length": 700.4126129150391, "epoch": 0.5598320503848845, "grad_norm": 7.883020401000977, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6543367393314838, "reward_std": 0.08848634106107056, "rewards/accuracy_reward": 0.43494897056370974, "rewards/semantic_entropy_math_reward": 0.6543367449194193, "rewards/total_entropy_reward": 1.368666134774685, "step": 50 }, { "completion_length": 726.035701751709, "epoch": 0.5710286913925823, "grad_norm": 16.26348114013672, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6665451787412167, "reward_std": 0.09329186473041773, "rewards/accuracy_reward": 0.40114795323461294, "rewards/semantic_entropy_math_reward": 0.6665451973676682, "rewards/total_entropy_reward": 1.3389556668698788, "step": 51 }, { "completion_length": 719.875617980957, "epoch": 0.58222533240028, "grad_norm": 10.366020202636719, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6474125292152166, "reward_std": 0.10870802192948759, "rewards/accuracy_reward": 0.3934948956593871, "rewards/semantic_entropy_math_reward": 0.6474125385284424, "rewards/total_entropy_reward": 1.3537504002451897, "step": 52 }, { "completion_length": 691.538890838623, "epoch": 0.5934219734079776, "grad_norm": 10.94919204711914, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6860422715544701, "reward_std": 0.0946744061075151, "rewards/accuracy_reward": 0.489158159121871, "rewards/semantic_entropy_math_reward": 0.6860422790050507, "rewards/total_entropy_reward": 1.3007621616125107, "step": 53 }, { "completion_length": 705.672176361084, "epoch": 0.6046186144156753, "grad_norm": 23.8171329498291, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6762026119977236, "reward_std": 0.09615424065850675, "rewards/accuracy_reward": 0.39668366592377424, "rewards/semantic_entropy_math_reward": 0.6762026362121105, "rewards/total_entropy_reward": 1.3121707029640675, "step": 54 }, { "completion_length": 677.1664428710938, "epoch": 0.615815255423373, "grad_norm": 9.031086921691895, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6896865852177143, "reward_std": 0.1015757208224386, "rewards/accuracy_reward": 0.4317601965740323, "rewards/semantic_entropy_math_reward": 0.6896865479648113, "rewards/total_entropy_reward": 1.287308655679226, "step": 55 }, { "completion_length": 713.0114631652832, "epoch": 0.6270118964310707, "grad_norm": 10.871789932250977, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6656341031193733, "reward_std": 0.11424232507124543, "rewards/accuracy_reward": 0.4221938643604517, "rewards/semantic_entropy_math_reward": 0.6656341068446636, "rewards/total_entropy_reward": 1.3229641020298004, "step": 56 }, { "completion_length": 677.5694923400879, "epoch": 0.6382085374387684, "grad_norm": 8.904267311096191, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6729227248579264, "reward_std": 0.10780132608488202, "rewards/accuracy_reward": 0.4381377436220646, "rewards/semantic_entropy_math_reward": 0.6729227565228939, "rewards/total_entropy_reward": 1.3122014850378036, "step": 57 }, { "completion_length": 687.1345500946045, "epoch": 0.6494051784464661, "grad_norm": 19.945981979370117, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6621720213443041, "reward_std": 0.10787450219504535, "rewards/accuracy_reward": 0.4183673388324678, "rewards/semantic_entropy_math_reward": 0.6621720027178526, "rewards/total_entropy_reward": 1.3265771567821503, "step": 58 }, { "completion_length": 673.6224384307861, "epoch": 0.6606018194541637, "grad_norm": 25.872291564941406, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6900510117411613, "reward_std": 0.09328114939853549, "rewards/accuracy_reward": 0.3998724389821291, "rewards/semantic_entropy_math_reward": 0.6900510005652905, "rewards/total_entropy_reward": 1.2932880148291588, "step": 59 }, { "completion_length": 712.2027912139893, "epoch": 0.6717984604618614, "grad_norm": 12.222248077392578, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6426749210804701, "reward_std": 0.09614149574190378, "rewards/accuracy_reward": 0.41007652156986296, "rewards/semantic_entropy_math_reward": 0.6426749173551798, "rewards/total_entropy_reward": 1.383428543806076, "step": 60 }, { "completion_length": 739.6472969055176, "epoch": 0.6829951014695591, "grad_norm": 9.738767623901367, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6273687966167927, "reward_std": 0.11152111599221826, "rewards/accuracy_reward": 0.34885203186422586, "rewards/semantic_entropy_math_reward": 0.6273688077926636, "rewards/total_entropy_reward": 1.3971327617764473, "step": 61 }, { "completion_length": 685.2659378051758, "epoch": 0.6941917424772568, "grad_norm": 20.52920913696289, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6596209742128849, "reward_std": 0.10549976932816207, "rewards/accuracy_reward": 0.43622447922825813, "rewards/semantic_entropy_math_reward": 0.6596209909766912, "rewards/total_entropy_reward": 1.3266954682767391, "step": 62 }, { "completion_length": 684.1128730773926, "epoch": 0.7053883834849545, "grad_norm": 13.253375053405762, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6740160267800093, "reward_std": 0.11020976235158741, "rewards/accuracy_reward": 0.4419642761349678, "rewards/semantic_entropy_math_reward": 0.6740160342305899, "rewards/total_entropy_reward": 1.295092262327671, "step": 63 }, { "completion_length": 702.8647766113281, "epoch": 0.7165850244926522, "grad_norm": 11.436138153076172, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6612609177827835, "reward_std": 0.0923373675905168, "rewards/accuracy_reward": 0.43239795230329037, "rewards/semantic_entropy_math_reward": 0.661260936409235, "rewards/total_entropy_reward": 1.3441792502999306, "step": 64 }, { "completion_length": 713.3341636657715, "epoch": 0.72778166550035, "grad_norm": 15.771661758422852, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6313775349408388, "reward_std": 0.1141419094055891, "rewards/accuracy_reward": 0.4139030510559678, "rewards/semantic_entropy_math_reward": 0.6313775237649679, "rewards/total_entropy_reward": 1.3911906033754349, "step": 65 }, { "completion_length": 691.0420761108398, "epoch": 0.7389783065080476, "grad_norm": 9.556193351745605, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6576165966689587, "reward_std": 0.10697318986058235, "rewards/accuracy_reward": 0.3386479513719678, "rewards/semantic_entropy_math_reward": 0.657616626471281, "rewards/total_entropy_reward": 1.3421761691570282, "step": 66 }, { "completion_length": 688.1179618835449, "epoch": 0.7501749475157453, "grad_norm": 18.490732192993164, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6712827831506729, "reward_std": 0.10525703080929816, "rewards/accuracy_reward": 0.445790808647871, "rewards/semantic_entropy_math_reward": 0.6712828055024147, "rewards/total_entropy_reward": 1.3154872134327888, "step": 67 }, { "completion_length": 647.132007598877, "epoch": 0.761371588523443, "grad_norm": 7.94115686416626, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6920553781092167, "reward_std": 0.09936971426941454, "rewards/accuracy_reward": 0.43176019564270973, "rewards/semantic_entropy_math_reward": 0.6920554004609585, "rewards/total_entropy_reward": 1.2725396156311035, "step": 68 }, { "completion_length": 699.4005012512207, "epoch": 0.7725682295311407, "grad_norm": 12.146166801452637, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6652696616947651, "reward_std": 0.11806112038902938, "rewards/accuracy_reward": 0.409438768401742, "rewards/semantic_entropy_math_reward": 0.6652696691453457, "rewards/total_entropy_reward": 1.3024558648467064, "step": 69 }, { "completion_length": 667.9285659790039, "epoch": 0.7837648705388384, "grad_norm": 15.953444480895996, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.679846927523613, "reward_std": 0.09083303948864341, "rewards/accuracy_reward": 0.4311224427074194, "rewards/semantic_entropy_math_reward": 0.6798469312489033, "rewards/total_entropy_reward": 1.3062404170632362, "step": 70 }, { "completion_length": 705.4400367736816, "epoch": 0.794961511546536, "grad_norm": 12.320276260375977, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6556122340261936, "reward_std": 0.09324299031868577, "rewards/accuracy_reward": 0.37946427892893553, "rewards/semantic_entropy_math_reward": 0.6556122545152903, "rewards/total_entropy_reward": 1.3581575378775597, "step": 71 }, { "completion_length": 654.1511325836182, "epoch": 0.8061581525542337, "grad_norm": 22.0692138671875, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6794824972748756, "reward_std": 0.10328507493250072, "rewards/accuracy_reward": 0.4209183603525162, "rewards/semantic_entropy_math_reward": 0.6794825121760368, "rewards/total_entropy_reward": 1.2847715727984905, "step": 72 }, { "completion_length": 690.308666229248, "epoch": 0.8173547935619314, "grad_norm": 10.22040843963623, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6780247762799263, "reward_std": 0.105113809928298, "rewards/accuracy_reward": 0.45918366499245167, "rewards/semantic_entropy_math_reward": 0.678024772554636, "rewards/total_entropy_reward": 1.2880272567272186, "step": 73 }, { "completion_length": 696.0803337097168, "epoch": 0.8285514345696291, "grad_norm": 20.899675369262695, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6516034882515669, "reward_std": 0.10480827221181244, "rewards/accuracy_reward": 0.3852040730416775, "rewards/semantic_entropy_math_reward": 0.6516034882515669, "rewards/total_entropy_reward": 1.3282338082790375, "step": 74 }, { "completion_length": 649.8654270172119, "epoch": 0.8397480755773268, "grad_norm": 9.469245910644531, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7246719934046268, "reward_std": 0.08447014342527837, "rewards/accuracy_reward": 0.433035708963871, "rewards/semantic_entropy_math_reward": 0.7246720045804977, "rewards/total_entropy_reward": 1.220706269145012, "step": 75 }, { "completion_length": 746.2142791748047, "epoch": 0.8509447165850245, "grad_norm": 10.530217170715332, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.639395035803318, "reward_std": 0.10488821647595614, "rewards/accuracy_reward": 0.40816325321793556, "rewards/semantic_entropy_math_reward": 0.6393950544297695, "rewards/total_entropy_reward": 1.3760143592953682, "step": 76 }, { "completion_length": 655.6836585998535, "epoch": 0.8621413575927221, "grad_norm": 14.1907958984375, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7228498421609402, "reward_std": 0.09376341942697763, "rewards/accuracy_reward": 0.4674744727090001, "rewards/semantic_entropy_math_reward": 0.7228498421609402, "rewards/total_entropy_reward": 1.2132117934525013, "step": 77 }, { "completion_length": 672.8137626647949, "epoch": 0.8733379986004198, "grad_norm": 13.5209379196167, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.652332354336977, "reward_std": 0.09460025187581778, "rewards/accuracy_reward": 0.3858418334275484, "rewards/semantic_entropy_math_reward": 0.6523323617875576, "rewards/total_entropy_reward": 1.351367250084877, "step": 78 }, { "completion_length": 696.9342880249023, "epoch": 0.8845346396081175, "grad_norm": 17.302839279174805, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6514212656766176, "reward_std": 0.09626807877793908, "rewards/accuracy_reward": 0.4062499897554517, "rewards/semantic_entropy_math_reward": 0.651421282440424, "rewards/total_entropy_reward": 1.366665042936802, "step": 79 }, { "completion_length": 682.3073806762695, "epoch": 0.8957312806158153, "grad_norm": 9.640515327453613, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6767492536455393, "reward_std": 0.10075846454128623, "rewards/accuracy_reward": 0.42028060276061296, "rewards/semantic_entropy_math_reward": 0.6767492648214102, "rewards/total_entropy_reward": 1.3104709684848785, "step": 80 }, { "completion_length": 722.4725646972656, "epoch": 0.906927921623513, "grad_norm": 6.948408603668213, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6521501280367374, "reward_std": 0.08798706252127886, "rewards/accuracy_reward": 0.4534438718110323, "rewards/semantic_entropy_math_reward": 0.6521501541137695, "rewards/total_entropy_reward": 1.3641655445098877, "step": 81 }, { "completion_length": 686.7238388061523, "epoch": 0.9181245626312107, "grad_norm": 47.02455139160156, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6811224482953548, "reward_std": 0.11016466980800033, "rewards/accuracy_reward": 0.42283162754029036, "rewards/semantic_entropy_math_reward": 0.6811224408447742, "rewards/total_entropy_reward": 1.2916913330554962, "step": 82 }, { "completion_length": 709.4598007202148, "epoch": 0.9293212036389084, "grad_norm": 31.659669876098633, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6086005661636591, "reward_std": 0.10625140275806189, "rewards/accuracy_reward": 0.37882652692496777, "rewards/semantic_entropy_math_reward": 0.60860057733953, "rewards/total_entropy_reward": 1.4536337479948997, "step": 83 }, { "completion_length": 661.408784866333, "epoch": 0.940517844646606, "grad_norm": 8.08792781829834, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6858600489795208, "reward_std": 0.09285477036610246, "rewards/accuracy_reward": 0.4221938652917743, "rewards/semantic_entropy_math_reward": 0.6858600452542305, "rewards/total_entropy_reward": 1.283115666359663, "step": 84 }, { "completion_length": 657.9400405883789, "epoch": 0.9517144856543037, "grad_norm": 21.801326751708984, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6805757954716682, "reward_std": 0.09728616522625089, "rewards/accuracy_reward": 0.43749999441206455, "rewards/semantic_entropy_math_reward": 0.6805758327245712, "rewards/total_entropy_reward": 1.3078671097755432, "step": 85 }, { "completion_length": 674.7142734527588, "epoch": 0.9629111266620014, "grad_norm": 11.559253692626953, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6862244829535484, "reward_std": 0.10050268471240997, "rewards/accuracy_reward": 0.4304846851155162, "rewards/semantic_entropy_math_reward": 0.686224490404129, "rewards/total_entropy_reward": 1.2905268669128418, "step": 86 }, { "completion_length": 636.0388870239258, "epoch": 0.9741077676696991, "grad_norm": 11.289789199829102, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6986151486635208, "reward_std": 0.10155197884887457, "rewards/accuracy_reward": 0.43686223216354847, "rewards/semantic_entropy_math_reward": 0.6986151374876499, "rewards/total_entropy_reward": 1.2592380531132221, "step": 87 }, { "completion_length": 683.5822525024414, "epoch": 0.9853044086773968, "grad_norm": 10.526566505432129, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6585276797413826, "reward_std": 0.10897616110742092, "rewards/accuracy_reward": 0.4017857098951936, "rewards/semantic_entropy_math_reward": 0.6585277020931244, "rewards/total_entropy_reward": 1.324688896536827, "step": 88 }, { "completion_length": 683.3411960601807, "epoch": 0.9965010496850945, "grad_norm": 19.354259490966797, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.6687317825853825, "reward_std": 0.09195416304282844, "rewards/accuracy_reward": 0.380102027207613, "rewards/semantic_entropy_math_reward": 0.6687317807227373, "rewards/total_entropy_reward": 1.3135743215680122, "step": 89 }, { "epoch": 0.9965010496850945, "step": 89, "total_flos": 0.0, "train_loss": 2.8008765204773544e-08, "train_runtime": 57265.8095, "train_samples_per_second": 0.349, "train_steps_per_second": 0.002 } ], "logging_steps": 1, "max_steps": 89, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }