{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5306122448979593, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 294.625, "epoch": 0.00510204081632653, "grad_norm": 0.8185210227966309, "kl": 0.0, "learning_rate": 8.474576271186442e-08, "loss": -0.0, "reward": 2.322195529937744, "reward_std": 3.603352189064026, "rewards/_soft_format_reward_func": 0.6937499940395355, "rewards/_strict_format_reward_func": 1.5, "rewards/_xml_count_reward_func": -0.6404999978840351, "rewards/check_answer": 0.7689455151557922, "step": 1 }, { "completion_length": 290.625, "epoch": 0.01020408163265306, "grad_norm": 2.1359550952911377, "kl": 0.0, "learning_rate": 1.6949152542372883e-07, "loss": -0.0, "reward": 1.21162611246109, "reward_std": 1.5170034170150757, "rewards/_soft_format_reward_func": -0.30000001192092896, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.29725000262260437, "rewards/check_answer": 0.8713762287516147, "step": 2 }, { "completion_length": 609.4375, "epoch": 0.015306122448979591, "grad_norm": 0.19122786819934845, "kl": 0.0015784860006533563, "learning_rate": 2.5423728813559323e-07, "loss": 0.0001, "reward": 50.498586282134056, "reward_std": 64.99645301699638, "rewards/_soft_format_reward_func": 1.1125000044703484, "rewards/_strict_format_reward_func": 1.6875, "rewards/_xml_count_reward_func": -2.054562598466873, "rewards/check_answer": 49.753145925700665, "step": 3 }, { "completion_length": 376.9375, "epoch": 0.02040816326530612, "grad_norm": 0.9175134897232056, "kl": 0.002859612286556512, "learning_rate": 3.3898305084745766e-07, "loss": 0.0001, "reward": 1.4672349244356155, "reward_std": 3.2955686151981354, "rewards/_soft_format_reward_func": 0.11249999701976776, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.5276250019669533, "rewards/check_answer": 0.9448598623275757, "step": 4 }, { "completion_length": 330.25, "epoch": 0.025510204081632654, "grad_norm": 1.0718908309936523, "kl": 0.0023224337492138147, "learning_rate": 4.2372881355932204e-07, "loss": 0.0001, "reward": -1.1594912707805634, "reward_std": 1.1251797080039978, "rewards/_soft_format_reward_func": -1.3875000029802322, "rewards/_strict_format_reward_func": 0.375, "rewards/_xml_count_reward_func": -0.14700000081211329, "rewards/check_answer": 8.726535270398017e-06, "step": 5 }, { "completion_length": 262.3125, "epoch": 0.030612244897959183, "grad_norm": 0.42630404233932495, "kl": 0.005721980705857277, "learning_rate": 5.084745762711865e-07, "loss": 0.0002, "reward": 0.7644375264644623, "reward_std": 1.8905271589756012, "rewards/_soft_format_reward_func": 0.1875, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.3605625070631504, "rewards/check_answer": 0.0, "step": 6 }, { "completion_length": 346.4375, "epoch": 0.03571428571428571, "grad_norm": 0.9312261343002319, "kl": 0.0050744940230735835, "learning_rate": 5.93220338983051e-07, "loss": 0.0002, "reward": -1.572375014424324, "reward_std": 0.5705349743366241, "rewards/_soft_format_reward_func": -1.1875, "rewards/_strict_format_reward_func": 0.0, "rewards/_xml_count_reward_func": -0.38487499207258224, "rewards/check_answer": 0.0, "step": 7 }, { "completion_length": 224.1875, "epoch": 0.04081632653061224, "grad_norm": 1.3169041872024536, "kl": 0.0011199476284673437, "learning_rate": 6.779661016949153e-07, "loss": 0.0, "reward": -0.09437501430511475, "reward_std": 0.8001106679439545, "rewards/_soft_format_reward_func": -0.5625, "rewards/_strict_format_reward_func": 0.75, "rewards/_xml_count_reward_func": -0.28187501011416316, "rewards/check_answer": 0.0, "step": 8 }, { "completion_length": 288.5, "epoch": 0.04591836734693878, "grad_norm": 0.32011842727661133, "kl": 0.00206242610784102, "learning_rate": 7.627118644067798e-07, "loss": 0.0001, "reward": 0.9716752767562866, "reward_std": 1.5422732569277287, "rewards/_soft_format_reward_func": -0.5, "rewards/_strict_format_reward_func": 1.125, "rewards/_xml_count_reward_func": -0.6016249805688858, "rewards/check_answer": 0.9483002722263336, "step": 9 }, { "completion_length": 428.5625, "epoch": 0.05102040816326531, "grad_norm": 1.1417263746261597, "kl": 0.0029192589954618597, "learning_rate": 8.474576271186441e-07, "loss": 0.0001, "reward": -0.809456929564476, "reward_std": 1.531682014465332, "rewards/_soft_format_reward_func": -1.199999988079071, "rewards/_strict_format_reward_func": 0.375, "rewards/_xml_count_reward_func": -0.0820000022649765, "rewards/check_answer": 0.09754307568073273, "step": 10 }, { "completion_length": 407.625, "epoch": 0.05612244897959184, "grad_norm": 0.32301968336105347, "kl": 0.0022870840039104223, "learning_rate": 9.322033898305086e-07, "loss": 0.0001, "reward": 1.1774811148643494, "reward_std": 1.3392982184886932, "rewards/_soft_format_reward_func": 0.05624997615814209, "rewards/_strict_format_reward_func": 1.5, "rewards/_xml_count_reward_func": -0.42518749833106995, "rewards/check_answer": 0.04641865938901901, "step": 11 }, { "completion_length": 298.6875, "epoch": 0.061224489795918366, "grad_norm": 0.37453481554985046, "kl": 0.003994872371947622, "learning_rate": 1.016949152542373e-06, "loss": 0.0002, "reward": 0.19297951459884644, "reward_std": 1.9170226454734802, "rewards/_soft_format_reward_func": -0.75, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.414187490940094, "rewards/check_answer": 0.4196670353412628, "step": 12 }, { "completion_length": 590.1875, "epoch": 0.0663265306122449, "grad_norm": 0.661213219165802, "kl": 0.00343362707644701, "learning_rate": 1.1016949152542374e-06, "loss": 0.0002, "reward": 1.760912761092186, "reward_std": 3.0866269270627527, "rewards/_soft_format_reward_func": -0.5687500089406967, "rewards/_strict_format_reward_func": 1.3125, "rewards/_xml_count_reward_func": -0.22106249630451202, "rewards/check_answer": 1.2382252807728946, "step": 13 }, { "completion_length": 342.5, "epoch": 0.07142857142857142, "grad_norm": 0.40530863404273987, "kl": 0.004755240981467068, "learning_rate": 1.186440677966102e-06, "loss": 0.0002, "reward": 1.6868359446525574, "reward_std": 2.877818286418915, "rewards/_soft_format_reward_func": -0.8250000029802322, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.011312499642372131, "rewards/check_answer": 1.5856482982635498, "step": 14 }, { "completion_length": 338.9375, "epoch": 0.07653061224489796, "grad_norm": 4.245687961578369, "kl": 0.06816600821912289, "learning_rate": 1.2711864406779662e-06, "loss": 0.0027, "reward": 3.4960225969552994, "reward_std": 4.042192316614091, "rewards/_soft_format_reward_func": 0.25, "rewards/_strict_format_reward_func": 1.125, "rewards/_xml_count_reward_func": -0.5118750035762787, "rewards/check_answer": 2.6328976154327393, "step": 15 }, { "completion_length": 461.25, "epoch": 0.08163265306122448, "grad_norm": 0.6524187922477722, "kl": 0.003892036440447555, "learning_rate": 1.3559322033898307e-06, "loss": 0.0002, "reward": -0.08632761240005493, "reward_std": 1.6764180362224579, "rewards/_soft_format_reward_func": -0.2874999940395355, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -1.1476250290870667, "rewards/check_answer": 0.41129739210009575, "step": 16 }, { "completion_length": 300.3125, "epoch": 0.08673469387755102, "grad_norm": 0.2593589723110199, "kl": 0.0008242716470050482, "learning_rate": 1.4406779661016951e-06, "loss": 0.0, "reward": -1.24406249076128, "reward_std": 0.6096278727054596, "rewards/_soft_format_reward_func": -1.1312500014901161, "rewards/_strict_format_reward_func": 0.0, "rewards/_xml_count_reward_func": -0.11281250417232513, "rewards/check_answer": 0.0, "step": 17 }, { "completion_length": 412.625, "epoch": 0.09183673469387756, "grad_norm": 0.929692804813385, "kl": 0.005735803686548024, "learning_rate": 1.5254237288135596e-06, "loss": 0.0002, "reward": 0.31456413865089417, "reward_std": 2.1592386066913605, "rewards/_soft_format_reward_func": 0.25, "rewards/_strict_format_reward_func": 0.75, "rewards/_xml_count_reward_func": -0.9343749992549419, "rewards/check_answer": 0.24893911182880402, "step": 18 }, { "completion_length": 470.0625, "epoch": 0.09693877551020408, "grad_norm": 0.3417404890060425, "kl": 0.00579645624384284, "learning_rate": 1.6101694915254237e-06, "loss": 0.0002, "reward": 0.22844918817281723, "reward_std": 1.6300125122070312, "rewards/_soft_format_reward_func": -0.1875, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -1.1502499729394913, "rewards/check_answer": 0.62869917973876, "step": 19 }, { "completion_length": 380.625, "epoch": 0.10204081632653061, "grad_norm": 267.041259765625, "kl": 0.9507867273296142, "learning_rate": 1.6949152542372882e-06, "loss": 0.038, "reward": -0.4774259477853775, "reward_std": 1.764455109834671, "rewards/_soft_format_reward_func": -0.6875, "rewards/_strict_format_reward_func": 0.1875, "rewards/_xml_count_reward_func": -0.46918751299381256, "rewards/check_answer": 0.49176159501075745, "step": 20 }, { "completion_length": 414.75, "epoch": 0.10714285714285714, "grad_norm": 0.3368138372898102, "kl": 0.003109428856987506, "learning_rate": 1.7796610169491526e-06, "loss": 0.0001, "reward": -0.7655205726623535, "reward_std": 1.0618179142475128, "rewards/_soft_format_reward_func": -0.6875, "rewards/_strict_format_reward_func": 0.5625, "rewards/_xml_count_reward_func": -0.7773125171661377, "rewards/check_answer": 0.1367919147014618, "step": 21 }, { "completion_length": 671.0, "epoch": 0.11224489795918367, "grad_norm": 0.9535510540008545, "kl": 0.003949811041820794, "learning_rate": 1.8644067796610171e-06, "loss": 0.0002, "reward": 0.8916858434677124, "reward_std": 0.37709038180764765, "rewards/_soft_format_reward_func": -0.4312499910593033, "rewards/_strict_format_reward_func": 1.5, "rewards/_xml_count_reward_func": -0.35756251215934753, "rewards/check_answer": 0.18049834482371807, "step": 22 }, { "completion_length": 683.625, "epoch": 0.11734693877551021, "grad_norm": 1.632488489151001, "kl": 0.03495925866445759, "learning_rate": 1.9491525423728816e-06, "loss": 0.0014, "reward": 0.82306969165802, "reward_std": 1.9999099373817444, "rewards/_soft_format_reward_func": -0.5499999970197678, "rewards/_strict_format_reward_func": 1.3125, "rewards/_xml_count_reward_func": -0.26243748515844345, "rewards/check_answer": 0.3230072185397148, "step": 23 }, { "completion_length": 381.0625, "epoch": 0.12244897959183673, "grad_norm": 0.2804034352302551, "kl": 0.012196791227324866, "learning_rate": 2.033898305084746e-06, "loss": 0.0005, "reward": -0.9751249700784683, "reward_std": 1.2838140726089478, "rewards/_soft_format_reward_func": -1.125, "rewards/_strict_format_reward_func": 0.375, "rewards/_xml_count_reward_func": -0.2251249998807907, "rewards/check_answer": 0.0, "step": 24 }, { "completion_length": 473.0625, "epoch": 0.12755102040816327, "grad_norm": 0.4001588523387909, "kl": 0.002974389062728733, "learning_rate": 2.11864406779661e-06, "loss": 0.0001, "reward": 2.849132001399994, "reward_std": 1.8594820201396942, "rewards/_soft_format_reward_func": 0.6500000059604645, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": -0.8167499899864197, "rewards/check_answer": 1.1408820822834969, "step": 25 }, { "completion_length": 522.0, "epoch": 0.1326530612244898, "grad_norm": 0.2913358211517334, "kl": 0.004198311798973009, "learning_rate": 2.203389830508475e-06, "loss": 0.0002, "reward": 0.5667000897228718, "reward_std": 1.8786026984453201, "rewards/_soft_format_reward_func": 0.08750000596046448, "rewards/_strict_format_reward_func": 0.75, "rewards/_xml_count_reward_func": -0.484437495470047, "rewards/check_answer": 0.21363750100135803, "step": 26 }, { "completion_length": 324.3125, "epoch": 0.1377551020408163, "grad_norm": 0.377543568611145, "kl": 0.005379673675633967, "learning_rate": 2.288135593220339e-06, "loss": 0.0002, "reward": 1.7930006384849548, "reward_std": 2.12799359485507, "rewards/_soft_format_reward_func": 0.375, "rewards/_strict_format_reward_func": 1.5, "rewards/_xml_count_reward_func": -0.7957500219345093, "rewards/check_answer": 0.7137506082653999, "step": 27 }, { "completion_length": 372.5, "epoch": 0.14285714285714285, "grad_norm": 0.37043654918670654, "kl": 0.003904464378138073, "learning_rate": 2.372881355932204e-06, "loss": 0.0002, "reward": 0.6625258177518845, "reward_std": 2.1411255598068237, "rewards/_soft_format_reward_func": 0.1875, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.8833125308156013, "rewards/check_answer": 0.4208383299410343, "step": 28 }, { "completion_length": 414.5625, "epoch": 0.14795918367346939, "grad_norm": 0.37689441442489624, "kl": 0.0018855973307836393, "learning_rate": 2.457627118644068e-06, "loss": 0.0001, "reward": -0.34431251883506775, "reward_std": 0.6518816608004272, "rewards/_soft_format_reward_func": -0.6875, "rewards/_strict_format_reward_func": 0.5625, "rewards/_xml_count_reward_func": -0.21931251138448715, "rewards/check_answer": 6.078471059822732e-15, "step": 29 }, { "completion_length": 471.0625, "epoch": 0.15306122448979592, "grad_norm": 0.32698705792427063, "kl": 0.0054136388207552955, "learning_rate": 2.5423728813559323e-06, "loss": 0.0002, "reward": 1.2012260109186172, "reward_std": 1.9618901312351227, "rewards/_soft_format_reward_func": 0.3812499940395355, "rewards/_strict_format_reward_func": 1.125, "rewards/_xml_count_reward_func": -1.0806874781847, "rewards/check_answer": 0.7756634612169364, "step": 30 }, { "completion_length": 543.1875, "epoch": 0.15816326530612246, "grad_norm": 0.6773600578308105, "kl": 0.009602147212717682, "learning_rate": 2.627118644067797e-06, "loss": 0.0004, "reward": 16.712120667099953, "reward_std": 31.081469893455505, "rewards/_soft_format_reward_func": 0.375, "rewards/_strict_format_reward_func": 1.5, "rewards/_xml_count_reward_func": -1.4680624902248383, "rewards/check_answer": 16.305183589458466, "step": 31 }, { "completion_length": 405.5, "epoch": 0.16326530612244897, "grad_norm": 2.430391311645508, "kl": 0.016679312742780894, "learning_rate": 2.7118644067796613e-06, "loss": 0.0007, "reward": -0.05791878700256348, "reward_std": 0.9276261329650879, "rewards/_soft_format_reward_func": -0.824999988079071, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.3491249978542328, "rewards/check_answer": 0.17870615608990192, "step": 32 }, { "completion_length": 367.6875, "epoch": 0.1683673469387755, "grad_norm": 0.8208937644958496, "kl": 0.008234784821979702, "learning_rate": 2.7966101694915256e-06, "loss": 0.0003, "reward": 6.702535092830658, "reward_std": 4.863340765237808, "rewards/_soft_format_reward_func": 1.5562500059604645, "rewards/_strict_format_reward_func": 2.0625, "rewards/_xml_count_reward_func": -1.0955625176429749, "rewards/check_answer": 4.179348034758277, "step": 33 }, { "completion_length": 670.8125, "epoch": 0.17346938775510204, "grad_norm": 0.17907316982746124, "kl": 0.0034322862866247306, "learning_rate": 2.8813559322033903e-06, "loss": 0.0001, "reward": 2.961810827255249, "reward_std": 7.308291792869568, "rewards/_soft_format_reward_func": -0.8562500029802322, "rewards/_strict_format_reward_func": 1.125, "rewards/_xml_count_reward_func": -0.43849998712539673, "rewards/check_answer": 3.131560802459717, "step": 34 }, { "completion_length": 387.125, "epoch": 0.17857142857142858, "grad_norm": 0.18726573884487152, "kl": 0.002935138749307953, "learning_rate": 2.9661016949152545e-06, "loss": 0.0001, "reward": 1.8329545259475708, "reward_std": 3.412140369415283, "rewards/_soft_format_reward_func": -0.3375000059604645, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.8870000243186951, "rewards/check_answer": 2.1199543476104736, "step": 35 }, { "completion_length": 739.5, "epoch": 0.1836734693877551, "grad_norm": 0.32407084107398987, "kl": 0.0060507280577439815, "learning_rate": 3.0508474576271192e-06, "loss": 0.0002, "reward": 0.4319094121456146, "reward_std": 2.4602610170841217, "rewards/_soft_format_reward_func": -0.13124999403953552, "rewards/_strict_format_reward_func": 0.9375, "rewards/_xml_count_reward_func": -0.9989374801516533, "rewards/check_answer": 0.6245969533920288, "step": 36 }, { "completion_length": 401.8125, "epoch": 0.18877551020408162, "grad_norm": 0.4821971654891968, "kl": 0.010924356349278241, "learning_rate": 3.135593220338983e-06, "loss": 0.0004, "reward": 1.8411645293235779, "reward_std": 3.823741763830185, "rewards/_soft_format_reward_func": 0.25, "rewards/_strict_format_reward_func": 1.125, "rewards/_xml_count_reward_func": -0.885937524959445, "rewards/check_answer": 1.3521020412445068, "step": 37 }, { "completion_length": 368.375, "epoch": 0.19387755102040816, "grad_norm": 0.6958189606666565, "kl": 0.01290791796054691, "learning_rate": 3.2203389830508473e-06, "loss": 0.0005, "reward": 1.7745112180709839, "reward_std": 2.5693418979644775, "rewards/_soft_format_reward_func": 1.1875, "rewards/_strict_format_reward_func": 1.125, "rewards/_xml_count_reward_func": -1.0533749610185623, "rewards/check_answer": 0.5153862088918686, "step": 38 }, { "completion_length": 826.125, "epoch": 0.1989795918367347, "grad_norm": 0.3377620279788971, "kl": 0.006919818581081927, "learning_rate": 3.305084745762712e-06, "loss": 0.0003, "reward": 35.50946241617203, "reward_std": 28.118246495723724, "rewards/_soft_format_reward_func": 1.4312500059604645, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.6590625643730164, "rewards/check_answer": 32.92477425445395, "step": 39 }, { "completion_length": 334.375, "epoch": 0.20408163265306123, "grad_norm": 1.3999114036560059, "kl": 0.011267256457358599, "learning_rate": 3.3898305084745763e-06, "loss": 0.0005, "reward": 1.117626965045929, "reward_std": 2.5586954951286316, "rewards/_soft_format_reward_func": 0.4375, "rewards/_strict_format_reward_func": 1.125, "rewards/_xml_count_reward_func": -0.8491249866783619, "rewards/check_answer": 0.40425196290016174, "step": 40 }, { "completion_length": 390.125, "epoch": 0.20918367346938777, "grad_norm": 1.1846420764923096, "kl": 0.01798212551511824, "learning_rate": 3.474576271186441e-06, "loss": 0.0007, "reward": 4.644592732191086, "reward_std": 1.390872847288847, "rewards/_soft_format_reward_func": 1.793749988079071, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.6293749511241913, "rewards/check_answer": 2.0427176877856255, "step": 41 }, { "completion_length": 476.1875, "epoch": 0.21428571428571427, "grad_norm": 0.6164702773094177, "kl": 0.047966267447918653, "learning_rate": 3.5593220338983053e-06, "loss": 0.0019, "reward": 2.8454742431640625, "reward_std": 2.382638132199645, "rewards/_soft_format_reward_func": 1.425000011920929, "rewards/_strict_format_reward_func": 2.0625, "rewards/_xml_count_reward_func": -1.6521874964237213, "rewards/check_answer": 1.0101617649197578, "step": 42 }, { "completion_length": 325.375, "epoch": 0.2193877551020408, "grad_norm": 1.3243409395217896, "kl": 0.01102221303153783, "learning_rate": 3.6440677966101695e-06, "loss": 0.0004, "reward": 4.189052075147629, "reward_std": 3.7774379551410675, "rewards/_soft_format_reward_func": 1.262499988079071, "rewards/_strict_format_reward_func": 1.6875, "rewards/_xml_count_reward_func": -0.871749997138977, "rewards/check_answer": 2.1108021295513026, "step": 43 }, { "completion_length": 483.4375, "epoch": 0.22448979591836735, "grad_norm": 2.6776297092437744, "kl": 0.024570247973315418, "learning_rate": 3.7288135593220342e-06, "loss": 0.001, "reward": 3.3171424567699432, "reward_std": 1.9457294531166553, "rewards/_soft_format_reward_func": 0.9500000029802322, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -0.710875004529953, "rewards/check_answer": 0.6405174862593412, "step": 44 }, { "completion_length": 401.875, "epoch": 0.22959183673469388, "grad_norm": 1.8040707111358643, "kl": 0.021586093585938215, "learning_rate": 3.8135593220338985e-06, "loss": 0.0009, "reward": 3.3045076727867126, "reward_std": 1.9318298771977425, "rewards/_soft_format_reward_func": 1.300000011920929, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -1.3513749986886978, "rewards/check_answer": 1.1058825515210629, "step": 45 }, { "completion_length": 442.1875, "epoch": 0.23469387755102042, "grad_norm": 0.28188657760620117, "kl": 0.01355510693974793, "learning_rate": 3.898305084745763e-06, "loss": 0.0005, "reward": 5.497533082962036, "reward_std": 5.00154435634613, "rewards/_soft_format_reward_func": 1.2312500029802322, "rewards/_strict_format_reward_func": 2.0625, "rewards/_xml_count_reward_func": -1.8056249618530273, "rewards/check_answer": 4.009408250451088, "step": 46 }, { "completion_length": 372.875, "epoch": 0.23979591836734693, "grad_norm": 0.5053686499595642, "kl": 0.010700618498958647, "learning_rate": 3.9830508474576275e-06, "loss": 0.0004, "reward": 22.94883681833744, "reward_std": 10.753062516450882, "rewards/_soft_format_reward_func": 1.375, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -1.2283124923706055, "rewards/check_answer": 20.55214899405837, "step": 47 }, { "completion_length": 394.5, "epoch": 0.24489795918367346, "grad_norm": 7.864255428314209, "kl": 0.010628446761984378, "learning_rate": 4.067796610169492e-06, "loss": 0.0004, "reward": 4.394192218780518, "reward_std": 1.785563262179494, "rewards/_soft_format_reward_func": 1.5187499970197678, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.218437522649765, "rewards/check_answer": 1.4688798922579736, "step": 48 }, { "completion_length": 725.375, "epoch": 0.25, "grad_norm": 0.9599105715751648, "kl": 0.012451534566935152, "learning_rate": 4.152542372881356e-06, "loss": 0.0005, "reward": 2.9797146916389465, "reward_std": 1.3605367243289948, "rewards/_soft_format_reward_func": 1.0562499985098839, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -0.7503125071525574, "rewards/check_answer": 0.42377725534606725, "step": 49 }, { "completion_length": 718.1875, "epoch": 0.25510204081632654, "grad_norm": 0.4486681818962097, "kl": 0.013971900450997055, "learning_rate": 4.23728813559322e-06, "loss": 0.0006, "reward": 51.57847714424133, "reward_std": 17.521905459463596, "rewards/_soft_format_reward_func": 1.59375, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.9316873885691166, "rewards/check_answer": 49.29141854145564, "step": 50 }, { "completion_length": 378.375, "epoch": 0.2602040816326531, "grad_norm": 1.6755878925323486, "kl": 0.015725657111033797, "learning_rate": 4.322033898305085e-06, "loss": 0.0006, "reward": 4.97844672203064, "reward_std": 1.7446883618831635, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.4906874597072601, "rewards/check_answer": 1.9691341519355774, "step": 51 }, { "completion_length": 314.125, "epoch": 0.2653061224489796, "grad_norm": 0.6237319707870483, "kl": 0.02619828935712576, "learning_rate": 4.40677966101695e-06, "loss": 0.001, "reward": 6.345100581645966, "reward_std": 2.3465639874339104, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.2206874787807465, "rewards/check_answer": 3.065787836909294, "step": 52 }, { "completion_length": 451.4375, "epoch": 0.27040816326530615, "grad_norm": 3.745959758758545, "kl": 0.0600818342063576, "learning_rate": 4.491525423728814e-06, "loss": 0.0024, "reward": 5.12294328212738, "reward_std": 1.3810148686170578, "rewards/_soft_format_reward_func": 1.6499999910593033, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.5819375328719616, "rewards/check_answer": 2.4298809214815265, "step": 53 }, { "completion_length": 522.8125, "epoch": 0.2755102040816326, "grad_norm": 1.2545146942138672, "kl": 0.014883615309372544, "learning_rate": 4.576271186440678e-06, "loss": 0.0006, "reward": 3.6991968154907227, "reward_std": 2.190860778093338, "rewards/_soft_format_reward_func": 1.5125000029802322, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.3406250104308128, "rewards/check_answer": 1.0898219845694257, "step": 54 }, { "completion_length": 410.6875, "epoch": 0.28061224489795916, "grad_norm": 16.85138702392578, "kl": 0.01683287846390158, "learning_rate": 4.6610169491525425e-06, "loss": 0.0007, "reward": 3.6215168833732605, "reward_std": 2.4318079613149166, "rewards/_soft_format_reward_func": 1.6312499940395355, "rewards/_strict_format_reward_func": 2.0625, "rewards/_xml_count_reward_func": -1.533000037074089, "rewards/check_answer": 1.460766777396202, "step": 55 }, { "completion_length": 371.125, "epoch": 0.2857142857142857, "grad_norm": 15.180438041687012, "kl": 0.02064416464418173, "learning_rate": 4.745762711864408e-06, "loss": 0.0008, "reward": 4.6238145381212234, "reward_std": 2.060830157250166, "rewards/_soft_format_reward_func": 1.5062499940395355, "rewards/_strict_format_reward_func": 2.0625, "rewards/_xml_count_reward_func": -1.0030625015497208, "rewards/check_answer": 2.0581270148977637, "step": 56 }, { "completion_length": 428.125, "epoch": 0.29081632653061223, "grad_norm": 0.3310839533805847, "kl": 0.018375703308265656, "learning_rate": 4.830508474576272e-06, "loss": 0.0007, "reward": 2.3490172177553177, "reward_std": 1.6269982382655144, "rewards/_soft_format_reward_func": 1.3687500059604645, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": -1.6375625133514404, "rewards/check_answer": 0.742829842492938, "step": 57 }, { "completion_length": 404.25, "epoch": 0.29591836734693877, "grad_norm": 0.34096795320510864, "kl": 0.010851162485778332, "learning_rate": 4.915254237288136e-06, "loss": 0.0004, "reward": 3.4022790044546127, "reward_std": 1.133506953716278, "rewards/_soft_format_reward_func": 1.100000023841858, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.3487499952316284, "rewards/check_answer": 1.2135289967991412, "step": 58 }, { "completion_length": 316.5, "epoch": 0.3010204081632653, "grad_norm": 3.706521511077881, "kl": 0.016447525937110186, "learning_rate": 5e-06, "loss": 0.0007, "reward": 4.640098571777344, "reward_std": 2.5934594720602036, "rewards/_soft_format_reward_func": 1.675000011920929, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -1.1507499814033508, "rewards/check_answer": 1.8658485412597656, "step": 59 }, { "completion_length": 467.625, "epoch": 0.30612244897959184, "grad_norm": 0.40216565132141113, "kl": 0.016874468652531505, "learning_rate": 4.999955914361218e-06, "loss": 0.0007, "reward": 3.062375247478485, "reward_std": 2.366086855530739, "rewards/_soft_format_reward_func": 1.2937500029802322, "rewards/_strict_format_reward_func": 2.0625, "rewards/_xml_count_reward_func": -1.407749943435192, "rewards/check_answer": 1.1138752102851868, "step": 60 }, { "completion_length": 596.0, "epoch": 0.3112244897959184, "grad_norm": 0.38576120138168335, "kl": 0.01672750909347087, "learning_rate": 4.999823658999708e-06, "loss": 0.0007, "reward": 4.684498995542526, "reward_std": 0.9209771901369095, "rewards/_soft_format_reward_func": 1.7750000059604645, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.5980625189840794, "rewards/check_answer": 1.5075614899396896, "step": 61 }, { "completion_length": 298.25, "epoch": 0.3163265306122449, "grad_norm": 21.191478729248047, "kl": 0.06260064756497741, "learning_rate": 4.999603238579919e-06, "loss": 0.0025, "reward": 5.105346739292145, "reward_std": 0.8463521376252174, "rewards/_soft_format_reward_func": 1.918749988079071, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.2289375066757202, "rewards/check_answer": 1.6030341610312462, "step": 62 }, { "completion_length": 545.1875, "epoch": 0.32142857142857145, "grad_norm": 0.6183257102966309, "kl": 0.010101811029016972, "learning_rate": 4.999294660875751e-06, "loss": 0.0004, "reward": 4.376120448112488, "reward_std": 1.4407944083213806, "rewards/_soft_format_reward_func": 1.75, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.8306874781847, "rewards/check_answer": 1.6443080008029938, "step": 63 }, { "completion_length": 491.875, "epoch": 0.32653061224489793, "grad_norm": 0.27649980783462524, "kl": 0.0261327491607517, "learning_rate": 4.998897936770281e-06, "loss": 0.001, "reward": 2.1283541917800903, "reward_std": 1.4720253944396973, "rewards/_soft_format_reward_func": 1.2000000178813934, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": -1.7337499856948853, "rewards/check_answer": 0.7871042089536786, "step": 64 }, { "completion_length": 419.625, "epoch": 0.33163265306122447, "grad_norm": 1.0943312644958496, "kl": 0.029257855378091335, "learning_rate": 4.998413080255376e-06, "loss": 0.0012, "reward": 3.823546826839447, "reward_std": 1.6460879147052765, "rewards/_soft_format_reward_func": 1.3500000089406967, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.0743749737739563, "rewards/check_answer": 0.9229219295084476, "step": 65 }, { "completion_length": 857.75, "epoch": 0.336734693877551, "grad_norm": 0.32977959513664246, "kl": 0.03638110449537635, "learning_rate": 4.997840108431203e-06, "loss": 0.0015, "reward": 3.149389237165451, "reward_std": 0.9396539479494095, "rewards/_soft_format_reward_func": 1.3999999910593033, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.5449376106262207, "rewards/check_answer": 0.8568266952133854, "step": 66 }, { "completion_length": 538.4375, "epoch": 0.34183673469387754, "grad_norm": 1.0524252653121948, "kl": 0.022415873361751437, "learning_rate": 4.997179041505628e-06, "loss": 0.0009, "reward": 5.703039646148682, "reward_std": 2.7801234126091003, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -2.2055625319480896, "rewards/check_answer": 3.408602237701416, "step": 67 }, { "completion_length": 315.5625, "epoch": 0.3469387755102041, "grad_norm": 0.3651297986507416, "kl": 0.028624295257031918, "learning_rate": 4.996429902793494e-06, "loss": 0.0011, "reward": 6.553413987159729, "reward_std": 0.9208076875656843, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.3125624656677246, "rewards/check_answer": 3.1159763261675835, "step": 68 }, { "completion_length": 325.3125, "epoch": 0.3520408163265306, "grad_norm": 0.36629951000213623, "kl": 0.00954329816158861, "learning_rate": 4.995592718715809e-06, "loss": 0.0004, "reward": 4.5701053738594055, "reward_std": 2.1177507576649077, "rewards/_soft_format_reward_func": 1.6875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.0841874927282333, "rewards/check_answer": 1.3417928112321533, "step": 69 }, { "completion_length": 379.25, "epoch": 0.35714285714285715, "grad_norm": 1.3463882207870483, "kl": 0.01597937708720565, "learning_rate": 4.9946675187988104e-06, "loss": 0.0006, "reward": 11.42964482307434, "reward_std": 8.272757766768336, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.378687471151352, "rewards/check_answer": 8.05833314359188, "step": 70 }, { "completion_length": 569.875, "epoch": 0.3622448979591837, "grad_norm": 0.7441303730010986, "kl": 0.014439634280279279, "learning_rate": 4.99365433567292e-06, "loss": 0.0006, "reward": 3.382221519947052, "reward_std": 1.6084937080740929, "rewards/_soft_format_reward_func": 1.649999976158142, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -2.3630625009536743, "rewards/check_answer": 1.4702840596437454, "step": 71 }, { "completion_length": 495.8125, "epoch": 0.3673469387755102, "grad_norm": 0.42513689398765564, "kl": 0.011408616206608713, "learning_rate": 4.992553205071599e-06, "loss": 0.0005, "reward": 11.998928546905518, "reward_std": 6.857654731720686, "rewards/_soft_format_reward_func": 1.793749988079071, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -2.1945624947547913, "rewards/check_answer": 9.77474146336317, "step": 72 }, { "completion_length": 342.8125, "epoch": 0.37244897959183676, "grad_norm": 0.427044153213501, "kl": 0.018415149534121156, "learning_rate": 4.991364165830082e-06, "loss": 0.0007, "reward": 5.027638792991638, "reward_std": 2.4753660559654236, "rewards/_soft_format_reward_func": 1.6500000059604645, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.9973750114440918, "rewards/check_answer": 1.562513753771782, "step": 73 }, { "completion_length": 382.0625, "epoch": 0.37755102040816324, "grad_norm": 0.4066709280014038, "kl": 0.011005707492586225, "learning_rate": 4.990087259884016e-06, "loss": 0.0004, "reward": 5.17357063293457, "reward_std": 3.7941238209605217, "rewards/_soft_format_reward_func": 1.793749988079071, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.3651875406503677, "rewards/check_answer": 2.3075080066919327, "step": 74 }, { "completion_length": 495.0, "epoch": 0.3826530612244898, "grad_norm": 2.2613179683685303, "kl": 0.01245829276740551, "learning_rate": 4.988722532267969e-06, "loss": 0.0005, "reward": 2.6375007927417755, "reward_std": 3.019617021083832, "rewards/_soft_format_reward_func": 1.581250011920929, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -2.2434374690055847, "rewards/check_answer": 1.049688383936882, "step": 75 }, { "completion_length": 513.625, "epoch": 0.3877551020408163, "grad_norm": 4.0517377853393555, "kl": 0.023214824497699738, "learning_rate": 4.987270031113855e-06, "loss": 0.0009, "reward": 3.0669388473033905, "reward_std": 3.475656270980835, "rewards/_soft_format_reward_func": 1.5625, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": -2.354249984025955, "rewards/check_answer": 1.9836888760328293, "step": 76 }, { "completion_length": 391.5625, "epoch": 0.39285714285714285, "grad_norm": 0.8595183491706848, "kl": 0.022897689836099744, "learning_rate": 4.985729807649224e-06, "loss": 0.0009, "reward": 3.036043405532837, "reward_std": 0.7024286016821861, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.823375016450882, "rewards/check_answer": 0.35941845644265413, "step": 77 }, { "completion_length": 417.125, "epoch": 0.3979591836734694, "grad_norm": 0.30788978934288025, "kl": 0.016252433881163597, "learning_rate": 4.984101916195467e-06, "loss": 0.0007, "reward": 5.285515308380127, "reward_std": 1.8254318535327911, "rewards/_soft_format_reward_func": 1.6875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.55799999833107, "rewards/check_answer": 2.5310151875019073, "step": 78 }, { "completion_length": 446.8125, "epoch": 0.4030612244897959, "grad_norm": 0.3968336582183838, "kl": 0.01630049163941294, "learning_rate": 4.9823864141658905e-06, "loss": 0.0006, "reward": 537.7010273933411, "reward_std": 489.2284908122383, "rewards/_soft_format_reward_func": 1.4500000029802322, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -1.3780625015497208, "rewards/check_answer": 535.379130016081, "step": 79 }, { "completion_length": 316.0, "epoch": 0.40816326530612246, "grad_norm": 1.0200837850570679, "kl": 0.018812671769410372, "learning_rate": 4.980583362063697e-06, "loss": 0.0008, "reward": 5.375112950801849, "reward_std": 1.6739845629781485, "rewards/_soft_format_reward_func": 1.762499988079071, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.0319999903440475, "rewards/check_answer": 2.207113090902567, "step": 80 }, { "completion_length": 320.4375, "epoch": 0.413265306122449, "grad_norm": 0.42193111777305603, "kl": 0.022067378275096416, "learning_rate": 4.978692823479849e-06, "loss": 0.0009, "reward": 6.265072703361511, "reward_std": 1.9197494611144066, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.0499375462532043, "rewards/check_answer": 2.5650103390216827, "step": 81 }, { "completion_length": 374.0, "epoch": 0.41836734693877553, "grad_norm": 0.6050392389297485, "kl": 0.025496677961200476, "learning_rate": 4.976714865090827e-06, "loss": 0.001, "reward": 2.5642066597938538, "reward_std": 1.6043110489845276, "rewards/_soft_format_reward_func": 1.3250000029802322, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": -0.9316874668002129, "rewards/check_answer": 0.29589414165820926, "step": 82 }, { "completion_length": 722.0, "epoch": 0.42346938775510207, "grad_norm": 3.9317643642425537, "kl": 0.02777448482811451, "learning_rate": 4.97464955665628e-06, "loss": 0.0011, "reward": 4.469326972961426, "reward_std": 2.0532086789608, "rewards/_soft_format_reward_func": 1.5750000029802322, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.1702499650418758, "rewards/check_answer": 1.4395770141854882, "step": 83 }, { "completion_length": 393.75, "epoch": 0.42857142857142855, "grad_norm": 0.34535089135169983, "kl": 0.030066173058003187, "learning_rate": 4.972496971016559e-06, "loss": 0.0012, "reward": 3.5937094688415527, "reward_std": 1.5248013995587826, "rewards/_soft_format_reward_func": 1.21875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -0.9721875041723251, "rewards/check_answer": 0.7221467904746532, "step": 84 }, { "completion_length": 478.0, "epoch": 0.4336734693877551, "grad_norm": 1.0733822584152222, "kl": 0.01771931373514235, "learning_rate": 4.970257184090156e-06, "loss": 0.0007, "reward": 3.772432804107666, "reward_std": 1.725758384913206, "rewards/_soft_format_reward_func": 1.4937500357627869, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -0.9836250096559525, "rewards/check_answer": 0.8248078285250813, "step": 85 }, { "completion_length": 534.125, "epoch": 0.4387755102040816, "grad_norm": 0.649592936038971, "kl": 0.01605043071322143, "learning_rate": 4.96793027487102e-06, "loss": 0.0006, "reward": 3.0387662947177887, "reward_std": 1.816498763859272, "rewards/_soft_format_reward_func": 1.4749999940395355, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": -1.094249963760376, "rewards/check_answer": 0.7830162237514742, "step": 86 }, { "completion_length": 672.1875, "epoch": 0.44387755102040816, "grad_norm": 136.27960205078125, "kl": 0.24366865053889342, "learning_rate": 4.9655163254257755e-06, "loss": 0.0098, "reward": 7.172706127166748, "reward_std": 2.587234117090702, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.9572500288486481, "rewards/check_answer": 3.4299561521038413, "step": 87 }, { "completion_length": 694.6875, "epoch": 0.4489795918367347, "grad_norm": 0.23488010466098785, "kl": 0.01088630617596209, "learning_rate": 4.963015420890825e-06, "loss": 0.0004, "reward": 7.021105051040649, "reward_std": 1.0360710583627224, "rewards/_soft_format_reward_func": 1.5499999821186066, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.3521250039339066, "rewards/check_answer": 3.82323000067845, "step": 88 }, { "completion_length": 498.125, "epoch": 0.45408163265306123, "grad_norm": 0.4348633289337158, "kl": 0.018164563458412886, "learning_rate": 4.960427649469346e-06, "loss": 0.0007, "reward": 3.7136881351470947, "reward_std": 2.085390269756317, "rewards/_soft_format_reward_func": 1.4624999910593033, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.48831257969141, "rewards/check_answer": 1.302000543102622, "step": 89 }, { "completion_length": 616.875, "epoch": 0.45918367346938777, "grad_norm": 1.5561532974243164, "kl": 0.015912600560113788, "learning_rate": 4.957753102428184e-06, "loss": 0.0006, "reward": 8.220839619636536, "reward_std": 3.1606629248708487, "rewards/_soft_format_reward_func": 1.693750023841858, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.0871875137090683, "rewards/check_answer": 4.80177686011848, "step": 90 }, { "completion_length": 351.3125, "epoch": 0.4642857142857143, "grad_norm": 1.2018077373504639, "kl": 0.06633210554718971, "learning_rate": 4.954991874094633e-06, "loss": 0.0027, "reward": 2.7519712522625923, "reward_std": 1.0931570180691779, "rewards/_soft_format_reward_func": 1.550000011920929, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": -1.0370625257492065, "rewards/check_answer": 0.36403384804725647, "step": 91 }, { "completion_length": 366.6875, "epoch": 0.46938775510204084, "grad_norm": 0.42523127794265747, "kl": 0.013800489017739892, "learning_rate": 4.952144061853103e-06, "loss": 0.0006, "reward": 6.534438908100128, "reward_std": 3.0693205446004868, "rewards/_soft_format_reward_func": 1.787500023841858, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.4720624350011349, "rewards/check_answer": 3.4065012373030186, "step": 92 }, { "completion_length": 398.6875, "epoch": 0.4744897959183674, "grad_norm": 0.22197820246219635, "kl": 0.010188436252065003, "learning_rate": 4.949209766141691e-06, "loss": 0.0004, "reward": 4.542325556278229, "reward_std": 1.159326359629631, "rewards/_soft_format_reward_func": 1.918749988079071, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.7997499704360962, "rewards/check_answer": 1.6108255833387375, "step": 93 }, { "completion_length": 500.5, "epoch": 0.47959183673469385, "grad_norm": 0.33497729897499084, "kl": 0.055587747134268284, "learning_rate": 4.946189090448639e-06, "loss": 0.0022, "reward": 4.837452530860901, "reward_std": 1.9922088906168938, "rewards/_soft_format_reward_func": 1.6124999970197678, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.0624999813735485, "rewards/check_answer": 1.6624527087719798, "step": 94 }, { "completion_length": 355.75, "epoch": 0.4846938775510204, "grad_norm": 0.31012672185897827, "kl": 0.0232304020319134, "learning_rate": 4.94308214130868e-06, "loss": 0.0009, "reward": 8.497554540634155, "reward_std": 3.9802782740443945, "rewards/_soft_format_reward_func": 1.5750000029802322, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -0.8483749888837337, "rewards/check_answer": 5.145929626800353, "step": 95 }, { "completion_length": 373.875, "epoch": 0.4897959183673469, "grad_norm": 0.22870931029319763, "kl": 0.012970933690667152, "learning_rate": 4.939889028299284e-06, "loss": 0.0005, "reward": 5.1682655811309814, "reward_std": 1.6821982599794865, "rewards/_soft_format_reward_func": 1.6187499910593033, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.3219375610351562, "rewards/check_answer": 2.0589530132710934, "step": 96 }, { "completion_length": 432.1875, "epoch": 0.49489795918367346, "grad_norm": 0.7341601252555847, "kl": 0.017009504605084658, "learning_rate": 4.936609864036793e-06, "loss": 0.0007, "reward": 5.641974210739136, "reward_std": 1.353297283872962, "rewards/_soft_format_reward_func": 1.90625, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.4101874977350235, "rewards/check_answer": 2.1459116395562887, "step": 97 }, { "completion_length": 363.875, "epoch": 0.5, "grad_norm": 1.3306468725204468, "kl": 0.01631148369051516, "learning_rate": 4.933244764172448e-06, "loss": 0.0007, "reward": 7.02120740711689, "reward_std": 3.146376432850957, "rewards/_soft_format_reward_func": 1.625, "rewards/_strict_format_reward_func": 2.0625, "rewards/_xml_count_reward_func": -0.9313750192523003, "rewards/check_answer": 4.26508229970932, "step": 98 }, { "completion_length": 725.25, "epoch": 0.5051020408163265, "grad_norm": 2.575485944747925, "kl": 0.011819152743555605, "learning_rate": 4.92979384738831e-06, "loss": 0.0005, "reward": 3.9428590536117554, "reward_std": 1.018309948965907, "rewards/_soft_format_reward_func": 1.3749999701976776, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.8431875072419643, "rewards/check_answer": 0.5985467173159122, "step": 99 }, { "completion_length": 356.8125, "epoch": 0.5102040816326531, "grad_norm": 0.3401452302932739, "kl": 0.02015103050507605, "learning_rate": 4.926257235393077e-06, "loss": 0.0008, "reward": 9.113705039024353, "reward_std": 5.51456093788147, "rewards/_soft_format_reward_func": 1.7125000059604645, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -1.025812529027462, "rewards/check_answer": 6.177018105983734, "step": 100 }, { "completion_length": 426.4375, "epoch": 0.5153061224489796, "grad_norm": 0.2861900329589844, "kl": 0.008014739083591849, "learning_rate": 4.922635052917786e-06, "loss": 0.0003, "reward": 4.119562268257141, "reward_std": 0.6518542803823948, "rewards/_soft_format_reward_func": 1.7937500178813934, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.2730625346302986, "rewards/check_answer": 0.5988747701048851, "step": 101 }, { "completion_length": 375.75, "epoch": 0.5204081632653061, "grad_norm": 0.37096062302589417, "kl": 0.015424605691805482, "learning_rate": 4.918927427711422e-06, "loss": 0.0006, "reward": 5.0053569078445435, "reward_std": 1.3419223129749298, "rewards/_soft_format_reward_func": 1.7750000059604645, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.221687525510788, "rewards/check_answer": 1.639544501900673, "step": 102 }, { "completion_length": 786.4375, "epoch": 0.5255102040816326, "grad_norm": 0.21075111627578735, "kl": 0.010193536480073817, "learning_rate": 4.915134490536403e-06, "loss": 0.0004, "reward": 7.238903224468231, "reward_std": 1.5646906150504947, "rewards/_soft_format_reward_func": 1.4750000089406967, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.8509999960660934, "rewards/check_answer": 3.614903382266789, "step": 103 }, { "completion_length": 440.1875, "epoch": 0.5306122448979592, "grad_norm": 0.43856188654899597, "kl": 0.023974559269845486, "learning_rate": 4.911256375163977e-06, "loss": 0.001, "reward": 8.465415358543396, "reward_std": 1.3520334959030151, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.5724374949932098, "rewards/check_answer": 5.03785252571106, "step": 104 }, { "completion_length": 744.0625, "epoch": 0.5357142857142857, "grad_norm": 0.19438262283802032, "kl": 0.011386665981262922, "learning_rate": 4.907293218369499e-06, "loss": 0.0005, "reward": 10.853159785270691, "reward_std": 1.569423645734787, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.059749960899353, "rewards/check_answer": 7.212910346628632, "step": 105 }, { "completion_length": 395.9375, "epoch": 0.5408163265306123, "grad_norm": 1.1634198427200317, "kl": 0.03389626881107688, "learning_rate": 4.903245159927607e-06, "loss": 0.0014, "reward": 5.962677836418152, "reward_std": 1.3103387877345085, "rewards/_soft_format_reward_func": 1.71875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.1786874681711197, "rewards/check_answer": 2.7976152896881104, "step": 106 }, { "completion_length": 463.9375, "epoch": 0.5459183673469388, "grad_norm": 0.3182987570762634, "kl": 0.014578778180293739, "learning_rate": 4.899112342607296e-06, "loss": 0.0006, "reward": 4.763711512088776, "reward_std": 1.6892382949590683, "rewards/_soft_format_reward_func": 1.6749999970197678, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -1.426062524318695, "rewards/check_answer": 2.264773984483327, "step": 107 }, { "completion_length": 642.125, "epoch": 0.5510204081632653, "grad_norm": 0.3488450348377228, "kl": 0.008207228442188352, "learning_rate": 4.894894912166878e-06, "loss": 0.0003, "reward": 5.196447372436523, "reward_std": 0.5015577161684632, "rewards/_soft_format_reward_func": 1.6625000089406967, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.6586249768733978, "rewards/check_answer": 1.1925724297761917, "step": 108 }, { "completion_length": 458.75, "epoch": 0.5561224489795918, "grad_norm": 0.38955190777778625, "kl": 0.021245236741378903, "learning_rate": 4.890593017348846e-06, "loss": 0.0008, "reward": 3.580578923225403, "reward_std": 1.7503393739461899, "rewards/_soft_format_reward_func": 1.71875, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -1.1833125054836273, "rewards/check_answer": 0.6076414063572884, "step": 109 }, { "completion_length": 421.0625, "epoch": 0.5612244897959183, "grad_norm": 1.2958167791366577, "kl": 0.029316942440345883, "learning_rate": 4.8862068098746246e-06, "loss": 0.0012, "reward": 4.520155489444733, "reward_std": 1.4188951924443245, "rewards/_soft_format_reward_func": 1.899999976158142, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.3512499928474426, "rewards/check_answer": 1.1589055806398392, "step": 110 }, { "completion_length": 386.8125, "epoch": 0.5663265306122449, "grad_norm": 0.3301832377910614, "kl": 0.02814935683272779, "learning_rate": 4.88173644443922e-06, "loss": 0.0011, "reward": 5.029857754707336, "reward_std": 0.8326428681612015, "rewards/_soft_format_reward_func": 1.7000000178813934, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.3943124823272228, "rewards/check_answer": 1.7241703867912292, "step": 111 }, { "completion_length": 678.25, "epoch": 0.5714285714285714, "grad_norm": 0.2823163568973541, "kl": 0.017542321234941483, "learning_rate": 4.877182078705766e-06, "loss": 0.0007, "reward": 4.434657633304596, "reward_std": 1.1893670558929443, "rewards/_soft_format_reward_func": 1.5125000029802322, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -0.6100625060498714, "rewards/check_answer": 1.0947200736713183, "step": 112 }, { "completion_length": 284.3125, "epoch": 0.576530612244898, "grad_norm": 0.36900410056114197, "kl": 0.021452047862112522, "learning_rate": 4.872543873299959e-06, "loss": 0.0009, "reward": 7.099963963031769, "reward_std": 3.859396807849407, "rewards/_soft_format_reward_func": 1.7249999940395355, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.8818749785423279, "rewards/check_answer": 3.4443390518426895, "step": 113 }, { "completion_length": 422.75, "epoch": 0.5816326530612245, "grad_norm": 0.29888710379600525, "kl": 0.028071329463273287, "learning_rate": 4.8678219918043984e-06, "loss": 0.0011, "reward": 5.36677348613739, "reward_std": 1.9548562616109848, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.3808125406503677, "rewards/check_answer": 1.9975859224796295, "step": 114 }, { "completion_length": 397.0, "epoch": 0.5867346938775511, "grad_norm": 0.25467562675476074, "kl": 0.020328150130808353, "learning_rate": 4.863016600752813e-06, "loss": 0.0008, "reward": 5.861165881156921, "reward_std": 2.294215776026249, "rewards/_soft_format_reward_func": 1.862500011920929, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.3174375146627426, "rewards/check_answer": 2.503603458404541, "step": 115 }, { "completion_length": 339.0, "epoch": 0.5918367346938775, "grad_norm": 0.4237631559371948, "kl": 0.02294452185742557, "learning_rate": 4.8581278696241924e-06, "loss": 0.0009, "reward": 7.219836235046387, "reward_std": 2.048996329307556, "rewards/_soft_format_reward_func": 1.7750000059604645, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.098249975591898, "rewards/check_answer": 3.543086051940918, "step": 116 }, { "completion_length": 436.75, "epoch": 0.5969387755102041, "grad_norm": 3.5356175899505615, "kl": 0.016916394233703613, "learning_rate": 4.853155970836802e-06, "loss": 0.0007, "reward": 6.2292903661727905, "reward_std": 0.982710599899292, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.5365625321865082, "rewards/check_answer": 2.7658529579639435, "step": 117 }, { "completion_length": 471.375, "epoch": 0.6020408163265306, "grad_norm": 0.39139553904533386, "kl": 0.013290104689076543, "learning_rate": 4.8481010797421106e-06, "loss": 0.0005, "reward": 8.786529064178467, "reward_std": 3.616459548473358, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.0753125101327896, "rewards/check_answer": 5.1618416756391525, "step": 118 }, { "completion_length": 353.6875, "epoch": 0.6071428571428571, "grad_norm": 0.28417208790779114, "kl": 0.018675302737392485, "learning_rate": 4.842963374618598e-06, "loss": 0.0007, "reward": 3.775757908821106, "reward_std": 2.131088227033615, "rewards/_soft_format_reward_func": 1.550000011920929, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": -0.9227500185370445, "rewards/check_answer": 0.8985078185796738, "step": 119 }, { "completion_length": 475.625, "epoch": 0.6122448979591837, "grad_norm": 0.4370657205581665, "kl": 0.022615838330239058, "learning_rate": 4.837743036665477e-06, "loss": 0.0009, "reward": 5.187110543251038, "reward_std": 2.9855866506695747, "rewards/_soft_format_reward_func": 1.7125000059604645, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -1.5276250094175339, "rewards/check_answer": 2.3772354535758495, "step": 120 }, { "completion_length": 528.3125, "epoch": 0.6173469387755102, "grad_norm": 0.2892380654811859, "kl": 0.014278996677603573, "learning_rate": 4.832440249996292e-06, "loss": 0.0006, "reward": 25.238910496234894, "reward_std": 10.534590110182762, "rewards/_soft_format_reward_func": 1.59375, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -0.9437499940395355, "rewards/check_answer": 21.963908864553076, "step": 121 }, { "completion_length": 365.875, "epoch": 0.6224489795918368, "grad_norm": 0.28936606645584106, "kl": 0.02362329768948257, "learning_rate": 4.827055201632435e-06, "loss": 0.0009, "reward": 5.751935660839081, "reward_std": 1.4266124442219734, "rewards/_soft_format_reward_func": 1.899999976158142, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -1.2576874569058418, "rewards/check_answer": 2.297123208642006, "step": 122 }, { "completion_length": 389.1875, "epoch": 0.6275510204081632, "grad_norm": 0.3011477589607239, "kl": 0.01705414243042469, "learning_rate": 4.821588081496541e-06, "loss": 0.0007, "reward": 5.534918427467346, "reward_std": 1.3727312982082367, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.0863125324249268, "rewards/check_answer": 1.921230850275606, "step": 123 }, { "completion_length": 464.25, "epoch": 0.6326530612244898, "grad_norm": 0.4716140925884247, "kl": 0.06439857231453061, "learning_rate": 4.816039082405799e-06, "loss": 0.0026, "reward": 3.45487904548645, "reward_std": 1.4920125305652618, "rewards/_soft_format_reward_func": 1.4187499731779099, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": -0.788750022649765, "rewards/check_answer": 0.3873790400090229, "step": 124 }, { "completion_length": 458.8125, "epoch": 0.6377551020408163, "grad_norm": 0.2910076975822449, "kl": 0.02810146939009428, "learning_rate": 4.810408400065145e-06, "loss": 0.0011, "reward": 4.422984063625336, "reward_std": 0.6079902481287718, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.777937427163124, "rewards/check_answer": 1.2009213715791702, "step": 125 }, { "completion_length": 418.6875, "epoch": 0.6428571428571429, "grad_norm": 0.5814986228942871, "kl": 0.022671347483992577, "learning_rate": 4.804696233060359e-06, "loss": 0.0009, "reward": 5.096649527549744, "reward_std": 1.7881849110126495, "rewards/_soft_format_reward_func": 1.800000011920929, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -0.7446874678134918, "rewards/check_answer": 1.41633702814579, "step": 126 }, { "completion_length": 360.5625, "epoch": 0.6479591836734694, "grad_norm": 0.36383190751075745, "kl": 0.015463740332052112, "learning_rate": 4.798902782851067e-06, "loss": 0.0006, "reward": 19.118314266204834, "reward_std": 3.782061517238617, "rewards/_soft_format_reward_func": 1.90625, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.36531252786517143, "rewards/check_answer": 14.577375411987305, "step": 127 }, { "completion_length": 265.625, "epoch": 0.6530612244897959, "grad_norm": 0.5543303489685059, "kl": 0.023673945106565952, "learning_rate": 4.793028253763633e-06, "loss": 0.0009, "reward": 6.470711827278137, "reward_std": 1.689875815063715, "rewards/_soft_format_reward_func": 1.8875000178813934, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.6451250202953815, "rewards/check_answer": 2.22833688557148, "step": 128 }, { "completion_length": 580.5, "epoch": 0.6581632653061225, "grad_norm": 0.29141587018966675, "kl": 0.04182523349300027, "learning_rate": 4.7870728529839495e-06, "loss": 0.0017, "reward": 6.982442021369934, "reward_std": 3.280642829835415, "rewards/_soft_format_reward_func": 1.737500011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.9067500084638596, "rewards/check_answer": 3.1516919434070587, "step": 129 }, { "completion_length": 387.25, "epoch": 0.6632653061224489, "grad_norm": 0.2995375394821167, "kl": 0.024216266116127372, "learning_rate": 4.781036790550134e-06, "loss": 0.001, "reward": 190.0288714170456, "reward_std": 97.05669444426894, "rewards/_soft_format_reward_func": 1.84375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.8299999982118607, "rewards/check_answer": 186.2026235461235, "step": 130 }, { "completion_length": 385.5, "epoch": 0.6683673469387755, "grad_norm": 0.551918625831604, "kl": 0.022381589747965336, "learning_rate": 4.774920279345121e-06, "loss": 0.0009, "reward": 6.143893599510193, "reward_std": 0.5779884234070778, "rewards/_soft_format_reward_func": 1.662500023841858, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.37799999862909317, "rewards/check_answer": 1.8593935797107406, "step": 131 }, { "completion_length": 432.0, "epoch": 0.673469387755102, "grad_norm": 1.2904788255691528, "kl": 0.03829192137345672, "learning_rate": 4.768723535089156e-06, "loss": 0.0015, "reward": 5.190044522285461, "reward_std": 0.9881309866905212, "rewards/_soft_format_reward_func": 1.7750000357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.3799375146627426, "rewards/check_answer": 0.7949821203947067, "step": 132 }, { "completion_length": 410.8125, "epoch": 0.6785714285714286, "grad_norm": 0.3133583068847656, "kl": 0.028204144444316626, "learning_rate": 4.762446776332179e-06, "loss": 0.0011, "reward": 6.298715114593506, "reward_std": 0.5748582370579243, "rewards/_soft_format_reward_func": 1.831250011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.12068751454353333, "rewards/check_answer": 1.346777692437172, "step": 133 }, { "completion_length": 393.0, "epoch": 0.6836734693877551, "grad_norm": 0.5568116903305054, "kl": 0.030879591591656208, "learning_rate": 4.756090224446127e-06, "loss": 0.0012, "reward": 7.258803009986877, "reward_std": 1.964128702878952, "rewards/_soft_format_reward_func": 1.90625, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.4805000275373459, "rewards/check_answer": 2.83305324614048, "step": 134 }, { "completion_length": 386.125, "epoch": 0.6887755102040817, "grad_norm": 1.3968427181243896, "kl": 0.04453878756612539, "learning_rate": 4.74965410361712e-06, "loss": 0.0018, "reward": 5.945132851600647, "reward_std": 1.0247465334832668, "rewards/_soft_format_reward_func": 1.550000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.4618750140070915, "rewards/check_answer": 1.857007990591228, "step": 135 }, { "completion_length": 396.25, "epoch": 0.6938775510204082, "grad_norm": 0.28240787982940674, "kl": 0.03747776383534074, "learning_rate": 4.7431386408375554e-06, "loss": 0.0015, "reward": 5.879474759101868, "reward_std": 0.6595749771222472, "rewards/_soft_format_reward_func": 1.8875000178813934, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.8629999682307243, "rewards/check_answer": 1.8549747318029404, "step": 136 }, { "completion_length": 606.0, "epoch": 0.6989795918367347, "grad_norm": 0.3750913441181183, "kl": 0.020721438224427402, "learning_rate": 4.736544065898105e-06, "loss": 0.0008, "reward": 4.356168568134308, "reward_std": 1.2325635273009539, "rewards/_soft_format_reward_func": 1.7187499701976776, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.5434374436736107, "rewards/check_answer": 1.180856066319393, "step": 137 }, { "completion_length": 354.6875, "epoch": 0.7040816326530612, "grad_norm": 0.392123281955719, "kl": 0.03661597007885575, "learning_rate": 4.729870611379609e-06, "loss": 0.0015, "reward": 7.636894226074219, "reward_std": 1.705483302474022, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.396000012755394, "rewards/check_answer": 3.107894539833069, "step": 138 }, { "completion_length": 440.375, "epoch": 0.7091836734693877, "grad_norm": 0.3271612226963043, "kl": 0.025694155134260654, "learning_rate": 4.72311851264487e-06, "loss": 0.001, "reward": 14.577420234680176, "reward_std": 2.7595168482512236, "rewards/_soft_format_reward_func": 1.6062500029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.1934374887496233, "rewards/check_answer": 9.777732692658901, "step": 139 }, { "completion_length": 932.3125, "epoch": 0.7142857142857143, "grad_norm": 0.24686619639396667, "kl": 0.017432109219953418, "learning_rate": 4.716288007830357e-06, "loss": 0.0007, "reward": 4.8316367864608765, "reward_std": 1.2224921584129333, "rewards/_soft_format_reward_func": 1.2874999940395355, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.5127499997615814, "rewards/check_answer": 1.0568868009577272, "step": 140 }, { "completion_length": 439.5625, "epoch": 0.7193877551020408, "grad_norm": 0.9186896681785583, "kl": 0.03715210082009435, "learning_rate": 4.709379337837804e-06, "loss": 0.0015, "reward": 5.025137662887573, "reward_std": 1.3306160643696785, "rewards/_soft_format_reward_func": 1.9249999523162842, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -1.1871249936521053, "rewards/check_answer": 1.287262663245201, "step": 141 }, { "completion_length": 705.9375, "epoch": 0.7244897959183674, "grad_norm": 0.36484846472740173, "kl": 0.025597061030566692, "learning_rate": 4.702392746325716e-06, "loss": 0.001, "reward": 6.963650107383728, "reward_std": 1.9291575253009796, "rewards/_soft_format_reward_func": 1.7187499701976776, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.598125021904707, "rewards/check_answer": 2.8430251479148865, "step": 142 }, { "completion_length": 686.375, "epoch": 0.7295918367346939, "grad_norm": 0.3817428946495056, "kl": 0.03906362532870844, "learning_rate": 4.695328479700772e-06, "loss": 0.0016, "reward": 8.219464182853699, "reward_std": 1.2078434526920319, "rewards/_soft_format_reward_func": 1.6250000149011612, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.06687499396502972, "rewards/check_answer": 3.6613386233802885, "step": 143 }, { "completion_length": 475.5625, "epoch": 0.7346938775510204, "grad_norm": 0.3780043125152588, "kl": 0.0657902080565691, "learning_rate": 4.688186787109136e-06, "loss": 0.0026, "reward": 5.480698108673096, "reward_std": 1.339848518371582, "rewards/_soft_format_reward_func": 1.5249999910593033, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.08481250144541264, "rewards/check_answer": 1.0583859297075833, "step": 144 }, { "completion_length": 327.5625, "epoch": 0.7397959183673469, "grad_norm": 0.6541346311569214, "kl": 0.03836058126762509, "learning_rate": 4.680967920427674e-06, "loss": 0.0015, "reward": 7.11588191986084, "reward_std": 0.7152784764766693, "rewards/_soft_format_reward_func": 1.943750023841858, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.11156249791383743, "rewards/check_answer": 2.060569554567337, "step": 145 }, { "completion_length": 280.1875, "epoch": 0.7448979591836735, "grad_norm": 0.3629293441772461, "kl": 0.062112570740282536, "learning_rate": 4.673672134255065e-06, "loss": 0.0025, "reward": 15.467483878135681, "reward_std": 2.8585988879203796, "rewards/_soft_format_reward_func": 1.7562499940395355, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2293124981224537, "rewards/check_answer": 10.481920555233955, "step": 146 }, { "completion_length": 474.25, "epoch": 0.75, "grad_norm": 0.48978060483932495, "kl": 0.04026471124961972, "learning_rate": 4.666299685902823e-06, "loss": 0.0016, "reward": 7.622194051742554, "reward_std": 1.0535718128085136, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.36156249418854713, "rewards/check_answer": 2.33563169836998, "step": 147 }, { "completion_length": 466.375, "epoch": 0.7551020408163265, "grad_norm": 0.30378639698028564, "kl": 0.047573494259268045, "learning_rate": 4.658850835386225e-06, "loss": 0.0019, "reward": 17.233438849449158, "reward_std": 3.6260106414556503, "rewards/_soft_format_reward_func": 1.7562499642372131, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3236249964684248, "rewards/check_answer": 12.153564229607582, "step": 148 }, { "completion_length": 509.625, "epoch": 0.7602040816326531, "grad_norm": 0.3636853098869324, "kl": 0.06674754060804844, "learning_rate": 4.651325845415136e-06, "loss": 0.0027, "reward": 5.467587828636169, "reward_std": 1.2641362864524126, "rewards/_soft_format_reward_func": 1.600000023841858, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.049625005573034286, "rewards/check_answer": 1.0054628625512123, "step": 149 }, { "completion_length": 357.9375, "epoch": 0.7653061224489796, "grad_norm": 0.5053626894950867, "kl": 0.051199947483837605, "learning_rate": 4.6437249813847495e-06, "loss": 0.002, "reward": 11.167248606681824, "reward_std": 4.9293607994914055, "rewards/_soft_format_reward_func": 1.5125000476837158, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2926249988377094, "rewards/check_answer": 6.362123340368271, "step": 150 }, { "completion_length": 378.875, "epoch": 0.7704081632653061, "grad_norm": 0.35232314467430115, "kl": 0.0667138583958149, "learning_rate": 4.636048511366222e-06, "loss": 0.0027, "reward": 6.404476523399353, "reward_std": 0.6765038818120956, "rewards/_soft_format_reward_func": 1.7750000357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.002562493085861206, "rewards/check_answer": 1.6320391297340393, "step": 151 }, { "completion_length": 269.75, "epoch": 0.7755102040816326, "grad_norm": 0.34677934646606445, "kl": 0.09044361300766468, "learning_rate": 4.62829670609722e-06, "loss": 0.0036, "reward": 14.928263902664185, "reward_std": 3.045043110847473, "rewards/_soft_format_reward_func": 1.7750000357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2966874986886978, "rewards/check_answer": 9.856576025485992, "step": 152 }, { "completion_length": 545.4375, "epoch": 0.7806122448979592, "grad_norm": 0.36773860454559326, "kl": 0.02953512966632843, "learning_rate": 4.620469838972374e-06, "loss": 0.0012, "reward": 7.423403382301331, "reward_std": 2.156498059630394, "rewards/_soft_format_reward_func": 1.506250038743019, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.20743750035762787, "rewards/check_answer": 2.8972157298582033, "step": 153 }, { "completion_length": 594.625, "epoch": 0.7857142857142857, "grad_norm": 0.2333289235830307, "kl": 0.025991217233240604, "learning_rate": 4.612568186033633e-06, "loss": 0.001, "reward": 12.17746376991272, "reward_std": 6.597691237926483, "rewards/_soft_format_reward_func": 1.75, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.12825000286102295, "rewards/check_answer": 7.74321323633194, "step": 154 }, { "completion_length": 852.8125, "epoch": 0.7908163265306123, "grad_norm": 0.2395699918270111, "kl": 0.03715619241120294, "learning_rate": 4.604592025960531e-06, "loss": 0.0015, "reward": 24.29518300294876, "reward_std": 9.197689248248935, "rewards/_soft_format_reward_func": 1.5500000268220901, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3271874934434891, "rewards/check_answer": 19.417993735411528, "step": 155 }, { "completion_length": 437.0625, "epoch": 0.7959183673469388, "grad_norm": 0.3087191879749298, "kl": 0.05684735253453255, "learning_rate": 4.596541640060358e-06, "loss": 0.0023, "reward": 9.101592540740967, "reward_std": 3.38905222248286, "rewards/_soft_format_reward_func": 1.675000011920929, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.3520000036805868, "rewards/check_answer": 4.262092791497707, "step": 156 }, { "completion_length": 303.1875, "epoch": 0.8010204081632653, "grad_norm": 0.44123315811157227, "kl": 0.06263354513794184, "learning_rate": 4.5884173122582376e-06, "loss": 0.0025, "reward": 11.551234245300293, "reward_std": 1.7726613469421864, "rewards/_soft_format_reward_func": 1.7750000357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.44756248593330383, "rewards/check_answer": 6.328671932220459, "step": 157 }, { "completion_length": 546.875, "epoch": 0.8061224489795918, "grad_norm": 0.3234706521034241, "kl": 0.05165292927995324, "learning_rate": 4.580219329087114e-06, "loss": 0.0021, "reward": 5.810450196266174, "reward_std": 0.6545056030154228, "rewards/_soft_format_reward_func": 1.400000050663948, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2633124962449074, "rewards/check_answer": 1.1471376624685945, "step": 158 }, { "completion_length": 791.625, "epoch": 0.8112244897959183, "grad_norm": 0.28645241260528564, "kl": 0.04510463122278452, "learning_rate": 4.5719479796776466e-06, "loss": 0.0018, "reward": 7.505560338497162, "reward_std": 1.395757220685482, "rewards/_soft_format_reward_func": 1.5500000268220901, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.1236875019967556, "rewards/check_answer": 3.079248049936723, "step": 159 }, { "completion_length": 492.25, "epoch": 0.8163265306122449, "grad_norm": 0.37338629364967346, "kl": 0.052091196179389954, "learning_rate": 4.563603555748015e-06, "loss": 0.0021, "reward": 7.5245548486709595, "reward_std": 1.7396526504307985, "rewards/_soft_format_reward_func": 1.768750011920929, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.12831250205636024, "rewards/check_answer": 2.8149924129247665, "step": 160 }, { "completion_length": 309.375, "epoch": 0.8214285714285714, "grad_norm": 0.3315311670303345, "kl": 0.089169105514884, "learning_rate": 4.555186351593625e-06, "loss": 0.0036, "reward": 7.606752276420593, "reward_std": 1.0673310905694962, "rewards/_soft_format_reward_func": 1.7750000655651093, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.348874993622303, "rewards/check_answer": 2.4828773885965347, "step": 161 }, { "completion_length": 343.6875, "epoch": 0.826530612244898, "grad_norm": 0.38737377524375916, "kl": 0.0651077888906002, "learning_rate": 4.546696664076734e-06, "loss": 0.0026, "reward": 9.475844025611877, "reward_std": 1.092118889093399, "rewards/_soft_format_reward_func": 1.7750000357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3891875073313713, "rewards/check_answer": 4.311656326055527, "step": 162 }, { "completion_length": 462.875, "epoch": 0.8316326530612245, "grad_norm": 0.40601518750190735, "kl": 0.04493711423128843, "learning_rate": 4.538134792615982e-06, "loss": 0.0018, "reward": 7.263669729232788, "reward_std": 2.5759617229923606, "rewards/_soft_format_reward_func": 1.6062500476837158, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.0736250039190054, "rewards/check_answer": 2.5837948471307755, "step": 163 }, { "completion_length": 553.0625, "epoch": 0.8367346938775511, "grad_norm": 0.4406701624393463, "kl": 0.0672515663318336, "learning_rate": 4.529501039175824e-06, "loss": 0.0027, "reward": 7.2119529247283936, "reward_std": 1.1174802966415882, "rewards/_soft_format_reward_func": 1.6812500357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.39887499809265137, "rewards/check_answer": 2.1318282186985016, "step": 164 }, { "completion_length": 396.375, "epoch": 0.8418367346938775, "grad_norm": 0.625912070274353, "kl": 0.06786507740616798, "learning_rate": 4.5207957082558904e-06, "loss": 0.0027, "reward": 21.559359431266785, "reward_std": 6.981817122315988, "rewards/_soft_format_reward_func": 1.7750000357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.39493750035762787, "rewards/check_answer": 16.389422226697206, "step": 165 }, { "completion_length": 341.75, "epoch": 0.8469387755102041, "grad_norm": 1.1860520839691162, "kl": 0.08321556635200977, "learning_rate": 4.51201910688024e-06, "loss": 0.0033, "reward": 7.035177946090698, "reward_std": 1.8561390489339828, "rewards/_soft_format_reward_func": 1.7187500596046448, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.34062499552965164, "rewards/check_answer": 1.9758030250668526, "step": 166 }, { "completion_length": 437.25, "epoch": 0.8520408163265306, "grad_norm": 0.34259557723999023, "kl": 0.0418678093701601, "learning_rate": 4.503171544586535e-06, "loss": 0.0016, "reward": 10.691133677959442, "reward_std": 3.948306621365191, "rewards/_soft_format_reward_func": 1.6062500029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.362125001847744, "rewards/check_answer": 5.722758798219729, "step": 167 }, { "completion_length": 454.4375, "epoch": 0.8571428571428571, "grad_norm": 0.41577261686325073, "kl": 0.03299556393176317, "learning_rate": 4.494253333415125e-06, "loss": 0.0013, "reward": 6.682947874069214, "reward_std": 1.3458359614014626, "rewards/_soft_format_reward_func": 1.6250000298023224, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.35218749567866325, "rewards/check_answer": 1.7057603895664215, "step": 168 }, { "completion_length": 443.1875, "epoch": 0.8622448979591837, "grad_norm": 0.608323335647583, "kl": 0.1554764355532825, "learning_rate": 4.485264787898037e-06, "loss": 0.0062, "reward": 24.002484679222107, "reward_std": 5.166570171713829, "rewards/_soft_format_reward_func": 1.850000023841858, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.47462499141693115, "rewards/check_answer": 18.677860498428345, "step": 169 }, { "completion_length": 529.0, "epoch": 0.8673469387755102, "grad_norm": 0.2710249722003937, "kl": 0.03932965733110905, "learning_rate": 4.476206225047889e-06, "loss": 0.0016, "reward": 5.216221511363983, "reward_std": 0.8351084915921092, "rewards/_soft_format_reward_func": 1.6062500476837158, "rewards/_strict_format_reward_func": 2.4375, "rewards/_xml_count_reward_func": 0.4116249978542328, "rewards/check_answer": 0.760846458375454, "step": 170 }, { "completion_length": 480.5, "epoch": 0.8724489795918368, "grad_norm": 0.46896764636039734, "kl": 0.03708732454106212, "learning_rate": 4.467077964346705e-06, "loss": 0.0015, "reward": 32.737399101257324, "reward_std": 12.719534158706665, "rewards/_soft_format_reward_func": 1.8125, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.36537499353289604, "rewards/check_answer": 27.55952274799347, "step": 171 }, { "completion_length": 564.625, "epoch": 0.8775510204081632, "grad_norm": 0.48409298062324524, "kl": 0.04766590194776654, "learning_rate": 4.457880327734647e-06, "loss": 0.0019, "reward": 7.888592720031738, "reward_std": 1.3809154629707336, "rewards/_soft_format_reward_func": 1.7375000417232513, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.42656249552965164, "rewards/check_answer": 2.7245304584503174, "step": 172 }, { "completion_length": 342.5, "epoch": 0.8826530612244898, "grad_norm": 0.3246302008628845, "kl": 0.06821039691567421, "learning_rate": 4.448613639598664e-06, "loss": 0.0027, "reward": 7.661576509475708, "reward_std": 0.627835601568222, "rewards/_soft_format_reward_func": 1.7375000417232513, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.4233749955892563, "rewards/check_answer": 2.5007017850875854, "step": 173 }, { "completion_length": 531.0625, "epoch": 0.8877551020408163, "grad_norm": 0.5014583468437195, "kl": 0.08092385483905673, "learning_rate": 4.43927822676105e-06, "loss": 0.0034, "reward": 8.200488924980164, "reward_std": 0.6848534047603607, "rewards/_soft_format_reward_func": 1.3250000327825546, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2559374962002039, "rewards/check_answer": 3.61955141882936, "step": 174 }, { "completion_length": 587.25, "epoch": 0.8928571428571429, "grad_norm": 0.5414162874221802, "kl": 0.049171761609613895, "learning_rate": 4.429874418467914e-06, "loss": 0.002, "reward": 15.98888349533081, "reward_std": 2.731475718319416, "rewards/_soft_format_reward_func": 1.8687500357627869, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.6382499784231186, "rewards/check_answer": 11.758384495973587, "step": 175 }, { "completion_length": 373.0625, "epoch": 0.8979591836734694, "grad_norm": 0.27149567008018494, "kl": 0.06143064517527819, "learning_rate": 4.4204025463775715e-06, "loss": 0.0025, "reward": 6.5129474401474, "reward_std": 0.8889782577753067, "rewards/_soft_format_reward_func": 1.6625000834465027, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2829375173896551, "rewards/check_answer": 1.5675101578235626, "step": 176 }, { "completion_length": 540.375, "epoch": 0.9030612244897959, "grad_norm": 0.2515435218811035, "kl": 0.04105467605404556, "learning_rate": 4.410862944548848e-06, "loss": 0.0016, "reward": 6.6056541204452515, "reward_std": 1.832587480545044, "rewards/_soft_format_reward_func": 1.899999976158142, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.0496249720454216, "rewards/check_answer": 1.8435290455818176, "step": 177 }, { "completion_length": 497.4375, "epoch": 0.9081632653061225, "grad_norm": 0.2942586839199066, "kl": 0.0357802826911211, "learning_rate": 4.401255949429299e-06, "loss": 0.0014, "reward": 9.341898918151855, "reward_std": 1.008721500635147, "rewards/_soft_format_reward_func": 1.8125000298023224, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.4579999968409538, "rewards/check_answer": 4.071399033069611, "step": 178 }, { "completion_length": 443.6875, "epoch": 0.9132653061224489, "grad_norm": 0.2456103414297104, "kl": 0.05043966881930828, "learning_rate": 4.391581899843335e-06, "loss": 0.002, "reward": 7.033362329006195, "reward_std": 1.3795625120401382, "rewards/_soft_format_reward_func": 1.4750000089406967, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2459375038743019, "rewards/check_answer": 2.312424931966234, "step": 179 }, { "completion_length": 400.625, "epoch": 0.9183673469387755, "grad_norm": 0.49319222569465637, "kl": 0.05620954278856516, "learning_rate": 4.38184113698028e-06, "loss": 0.0022, "reward": 6.303773522377014, "reward_std": 1.6371471658349037, "rewards/_soft_format_reward_func": 1.7437500059604645, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": 0.0886249914765358, "rewards/check_answer": 1.8463987112045288, "step": 180 }, { "completion_length": 474.875, "epoch": 0.923469387755102, "grad_norm": 0.48986172676086426, "kl": 0.048913688864558935, "learning_rate": 4.372034004382338e-06, "loss": 0.002, "reward": 5.959681272506714, "reward_std": 0.4265642538666725, "rewards/_soft_format_reward_func": 1.850000023841858, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3892499841749668, "rewards/check_answer": 0.7204312160611153, "step": 181 }, { "completion_length": 619.5, "epoch": 0.9285714285714286, "grad_norm": 0.23095013201236725, "kl": 0.03409982565790415, "learning_rate": 4.362160847932473e-06, "loss": 0.0014, "reward": 5.400798320770264, "reward_std": 1.4592021107673645, "rewards/_soft_format_reward_func": 1.5250000208616257, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.21550003439188004, "rewards/check_answer": 1.2787983370944858, "step": 182 }, { "completion_length": 629.5625, "epoch": 0.9336734693877551, "grad_norm": 0.24370257556438446, "kl": 0.01954989810474217, "learning_rate": 4.35222201584221e-06, "loss": 0.0008, "reward": 5.626599490642548, "reward_std": 0.4261799646774307, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.24112499505281448, "rewards/check_answer": 0.6854746059398167, "step": 183 }, { "completion_length": 322.5, "epoch": 0.9387755102040817, "grad_norm": 0.5464153289794922, "kl": 0.05567363370209932, "learning_rate": 4.3422178586393615e-06, "loss": 0.0022, "reward": 7.5745275020599365, "reward_std": 3.526697516441345, "rewards/_soft_format_reward_func": 1.7187500596046448, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.07387499883770943, "rewards/check_answer": 2.969402402639389, "step": 184 }, { "completion_length": 802.6875, "epoch": 0.9438775510204082, "grad_norm": 0.2673207223415375, "kl": 0.024011684115976095, "learning_rate": 4.332148729155654e-06, "loss": 0.001, "reward": 5.654477477073669, "reward_std": 1.3770648315548897, "rewards/_soft_format_reward_func": 1.5125000178813934, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.07100000604987144, "rewards/check_answer": 1.2129775285720825, "step": 185 }, { "completion_length": 266.4375, "epoch": 0.9489795918367347, "grad_norm": 0.3424183130264282, "kl": 0.07378911692649126, "learning_rate": 4.322014982514292e-06, "loss": 0.003, "reward": 6.338861703872681, "reward_std": 0.6802131589502096, "rewards/_soft_format_reward_func": 1.981249988079071, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.1186249852180481, "rewards/check_answer": 1.2389868646860123, "step": 186 }, { "completion_length": 379.1875, "epoch": 0.9540816326530612, "grad_norm": 0.3071301579475403, "kl": 0.05790677620097995, "learning_rate": 4.3118169761174315e-06, "loss": 0.0023, "reward": 7.1564600467681885, "reward_std": 0.5172314047813416, "rewards/_soft_format_reward_func": 1.8687500059604645, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3855624943971634, "rewards/check_answer": 1.902147650718689, "step": 187 }, { "completion_length": 346.375, "epoch": 0.9591836734693877, "grad_norm": 0.28398242592811584, "kl": 0.05025961343199015, "learning_rate": 4.301555069633571e-06, "loss": 0.002, "reward": 8.402802467346191, "reward_std": 0.9663780927658081, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.4886249974370003, "rewards/check_answer": 2.9891776740550995, "step": 188 }, { "completion_length": 469.5625, "epoch": 0.9642857142857143, "grad_norm": 0.2812504172325134, "kl": 0.036826275289058685, "learning_rate": 4.291229624984876e-06, "loss": 0.0015, "reward": 15.558440685272217, "reward_std": 8.35078378021717, "rewards/_soft_format_reward_func": 1.7000000476837158, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2761249952018261, "rewards/check_answer": 10.58231633901596, "step": 189 }, { "completion_length": 398.0625, "epoch": 0.9693877551020408, "grad_norm": 0.3562508523464203, "kl": 0.049638946540653706, "learning_rate": 4.280841006334403e-06, "loss": 0.002, "reward": 27.97546100616455, "reward_std": 16.727137465029955, "rewards/_soft_format_reward_func": 1.6437500417232513, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.4568124860525131, "rewards/check_answer": 23.062399968504906, "step": 190 }, { "completion_length": 393.0625, "epoch": 0.9744897959183674, "grad_norm": 36.12807083129883, "kl": 0.33482956141233444, "learning_rate": 4.270389580073264e-06, "loss": 0.0134, "reward": 7.393812417984009, "reward_std": 1.9795853942632675, "rewards/_soft_format_reward_func": 1.8875000476837158, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.12450001761317253, "rewards/check_answer": 2.6308123022317886, "step": 191 }, { "completion_length": 295.5625, "epoch": 0.9795918367346939, "grad_norm": 0.2944537401199341, "kl": 0.04368643742054701, "learning_rate": 4.2598757148076996e-06, "loss": 0.0017, "reward": 11.020132422447205, "reward_std": 2.0972883477807045, "rewards/_soft_format_reward_func": 1.962499976158142, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.26499998942017555, "rewards/check_answer": 5.792632728815079, "step": 192 }, { "completion_length": 364.6875, "epoch": 0.9846938775510204, "grad_norm": 0.1708153784275055, "kl": 0.029444904066622257, "learning_rate": 4.249299781346086e-06, "loss": 0.0012, "reward": 6.42504096031189, "reward_std": 0.45998556911945343, "rewards/_soft_format_reward_func": 1.6250000149011612, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3477499932050705, "rewards/check_answer": 1.4522912870161235, "step": 193 }, { "completion_length": 297.3125, "epoch": 0.9897959183673469, "grad_norm": 0.3687419593334198, "kl": 0.04513590410351753, "learning_rate": 4.2386621526858465e-06, "loss": 0.0018, "reward": 7.11621256172657, "reward_std": 1.6567531460896134, "rewards/_soft_format_reward_func": 1.731249988079071, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": 0.304749995470047, "rewards/check_answer": 2.8302126228809357, "step": 194 }, { "completion_length": 399.3125, "epoch": 0.9948979591836735, "grad_norm": 0.3162878453731537, "kl": 0.036621647188439965, "learning_rate": 4.227963204000305e-06, "loss": 0.0015, "reward": 129.70068180561066, "reward_std": 113.08588391542435, "rewards/_soft_format_reward_func": 1.831250011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.2776250094175339, "rewards/check_answer": 125.14704971015453, "step": 195 }, { "completion_length": 360.3125, "epoch": 1.0, "grad_norm": 0.3047390878200531, "kl": 0.0588951304089278, "learning_rate": 4.217203312625453e-06, "loss": 0.0024, "reward": 8.42742919921875, "reward_std": 2.6744541078805923, "rewards/_soft_format_reward_func": 1.887499988079071, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.05043748766183853, "rewards/check_answer": 3.4894914776086807, "step": 196 }, { "completion_length": 341.0625, "epoch": 1.0051020408163265, "grad_norm": 0.31391653418540955, "kl": 0.04442111076787114, "learning_rate": 4.206382858046636e-06, "loss": 0.0018, "reward": 7.735453367233276, "reward_std": 1.065683752298355, "rewards/_soft_format_reward_func": 1.8312499821186066, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.43312498927116394, "rewards/check_answer": 2.4710785150527954, "step": 197 }, { "completion_length": 375.3125, "epoch": 1.010204081632653, "grad_norm": 0.4202650785446167, "kl": 0.04309396957978606, "learning_rate": 4.195502221885176e-06, "loss": 0.0017, "reward": 10.156509637832642, "reward_std": 1.2403309643268585, "rewards/_soft_format_reward_func": 1.9625000059604645, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.4984999895095825, "rewards/check_answer": 4.695509642362595, "step": 198 }, { "completion_length": 557.8125, "epoch": 1.0153061224489797, "grad_norm": 0.411990761756897, "kl": 0.04207156877964735, "learning_rate": 4.184561787884911e-06, "loss": 0.0017, "reward": 6.723345756530762, "reward_std": 0.40229716151952744, "rewards/_soft_format_reward_func": 1.981249988079071, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.492374986410141, "rewards/check_answer": 1.249720811843872, "step": 199 }, { "completion_length": 317.875, "epoch": 1.0204081632653061, "grad_norm": 0.5010614991188049, "kl": 0.09044251404702663, "learning_rate": 4.173561941898656e-06, "loss": 0.0036, "reward": 6.473593235015869, "reward_std": 1.4734688764438033, "rewards/_soft_format_reward_func": 1.7750000059604645, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.31718749552965164, "rewards/check_answer": 1.381405621767044, "step": 200 }, { "completion_length": 452.0625, "epoch": 1.0255102040816326, "grad_norm": 0.3681066334247589, "kl": 0.03809668601024896, "learning_rate": 4.162503071874603e-06, "loss": 0.0015, "reward": 6.502772688865662, "reward_std": 1.3066000789403915, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.38712503761053085, "rewards/check_answer": 1.8898976296186447, "step": 201 }, { "completion_length": 423.75, "epoch": 1.030612244897959, "grad_norm": 0.19629111886024475, "kl": 0.046010272577404976, "learning_rate": 4.151385567842629e-06, "loss": 0.0018, "reward": 399.35719668865204, "reward_std": 279.76188530772924, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.14612499251961708, "rewards/check_answer": 394.21108666062355, "step": 202 }, { "completion_length": 338.9375, "epoch": 1.0357142857142858, "grad_norm": 0.34913596510887146, "kl": 0.05951223103329539, "learning_rate": 4.140209821900548e-06, "loss": 0.0024, "reward": 5.7086756229400635, "reward_std": 1.2880188524723053, "rewards/_soft_format_reward_func": 1.8812499642372131, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.23206248879432678, "rewards/check_answer": 0.7828629612922668, "step": 203 }, { "completion_length": 342.0625, "epoch": 1.0408163265306123, "grad_norm": 0.3435830771923065, "kl": 0.04380645975470543, "learning_rate": 4.12897622820028e-06, "loss": 0.0018, "reward": 8.00225567817688, "reward_std": 1.523213267326355, "rewards/_soft_format_reward_func": 1.8875000178813934, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.36687498912215233, "rewards/check_answer": 2.7478803396224976, "step": 204 }, { "completion_length": 356.125, "epoch": 1.0459183673469388, "grad_norm": 0.36134347319602966, "kl": 0.04655326111242175, "learning_rate": 4.117685182933947e-06, "loss": 0.0019, "reward": 6.2672423124313354, "reward_std": 0.3675787951797247, "rewards/_soft_format_reward_func": 1.943750023841858, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.47449998557567596, "rewards/check_answer": 0.8489925488829613, "step": 205 }, { "completion_length": 577.4375, "epoch": 1.0510204081632653, "grad_norm": 0.4111148416996002, "kl": 0.049234330188483, "learning_rate": 4.106337084319904e-06, "loss": 0.002, "reward": 7.752165675163269, "reward_std": 0.4733734019100666, "rewards/_soft_format_reward_func": 1.6625000089406967, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3658749908208847, "rewards/check_answer": 2.7237908765673637, "step": 206 }, { "completion_length": 308.375, "epoch": 1.0561224489795917, "grad_norm": 0.5833783745765686, "kl": 0.05842061527073383, "learning_rate": 4.094932332588693e-06, "loss": 0.0023, "reward": 6.586714863777161, "reward_std": 2.829386033117771, "rewards/_soft_format_reward_func": 1.7562499940395355, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.3178749941289425, "rewards/check_answer": 1.7000898569822311, "step": 207 }, { "completion_length": 331.625, "epoch": 1.0612244897959184, "grad_norm": 0.29140666127204895, "kl": 0.03743181750178337, "learning_rate": 4.083471329968926e-06, "loss": 0.0015, "reward": 8.740661025047302, "reward_std": 0.7558915354311466, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 3.2396610528230667, "step": 208 }, { "completion_length": 302.6875, "epoch": 1.066326530612245, "grad_norm": 0.4549320638179779, "kl": 0.05644373595714569, "learning_rate": 4.071954480673098e-06, "loss": 0.0023, "reward": 6.7335838079452515, "reward_std": 0.6734579056501389, "rewards/_soft_format_reward_func": 1.943750023841858, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.373687494546175, "rewards/check_answer": 1.4161463677883148, "step": 209 }, { "completion_length": 346.375, "epoch": 1.0714285714285714, "grad_norm": 0.3305790424346924, "kl": 0.04994491580873728, "learning_rate": 4.0603821908833386e-06, "loss": 0.002, "reward": 9.235015630722046, "reward_std": 1.2825317829847336, "rewards/_soft_format_reward_func": 1.981249988079071, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.2917499840259552, "rewards/check_answer": 3.962015599012375, "step": 210 }, { "completion_length": 542.125, "epoch": 1.0765306122448979, "grad_norm": 0.22213147580623627, "kl": 0.026279668672941625, "learning_rate": 4.048754868737075e-06, "loss": 0.0011, "reward": 22.33821666240692, "reward_std": 1.0081460773944855, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.37574999034404755, "rewards/check_answer": 17.26246675942093, "step": 211 }, { "completion_length": 325.25, "epoch": 1.0816326530612246, "grad_norm": 0.33424288034439087, "kl": 0.04722048016265035, "learning_rate": 4.037072924312649e-06, "loss": 0.0019, "reward": 7.3233397006988525, "reward_std": 1.6336696669459343, "rewards/_soft_format_reward_func": 1.800000011920929, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.46449999511241913, "rewards/check_answer": 2.246339738368988, "step": 212 }, { "completion_length": 380.25, "epoch": 1.086734693877551, "grad_norm": 0.3298036456108093, "kl": 0.03231387445703149, "learning_rate": 4.0253367696148435e-06, "loss": 0.0013, "reward": 10.010229587554932, "reward_std": 2.455143466591835, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 4.509229928255081, "step": 213 }, { "completion_length": 449.25, "epoch": 1.0918367346938775, "grad_norm": 0.2562406063079834, "kl": 0.03419307968579233, "learning_rate": 4.013546818560362e-06, "loss": 0.0014, "reward": 8.042372584342957, "reward_std": 0.751374788582325, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.49318748712539673, "rewards/check_answer": 2.5491852164268494, "step": 214 }, { "completion_length": 345.8125, "epoch": 1.096938775510204, "grad_norm": 0.2993167042732239, "kl": 0.03938100393861532, "learning_rate": 4.001703486963223e-06, "loss": 0.0016, "reward": 13.524673461914062, "reward_std": 3.062700480222702, "rewards/_soft_format_reward_func": 1.962499976158142, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.48374998569488525, "rewards/check_answer": 8.078423976898193, "step": 215 }, { "completion_length": 579.0, "epoch": 1.1020408163265305, "grad_norm": 0.23525753617286682, "kl": 0.038901340682059526, "learning_rate": 3.989807192520098e-06, "loss": 0.0016, "reward": 11.004587292671204, "reward_std": 3.6870256178081036, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.1887500286102295, "rewards/check_answer": 6.193337500095367, "step": 216 }, { "completion_length": 354.375, "epoch": 1.1071428571428572, "grad_norm": 0.4108511209487915, "kl": 0.049589950358495116, "learning_rate": 3.9778583547955765e-06, "loss": 0.002, "reward": 7.848548054695129, "reward_std": 0.9341182895004749, "rewards/_soft_format_reward_func": 1.90625, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.4646874964237213, "rewards/check_answer": 2.4776105992496014, "step": 217 }, { "completion_length": 239.3125, "epoch": 1.1122448979591837, "grad_norm": 0.37700262665748596, "kl": 0.07110750861465931, "learning_rate": 3.965857395207375e-06, "loss": 0.0028, "reward": 6.046079754829407, "reward_std": 0.22055783914402127, "rewards/_soft_format_reward_func": 1.9437499642372131, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.4906249940395355, "rewards/check_answer": 0.6117046624422073, "step": 218 }, { "completion_length": 391.75, "epoch": 1.1173469387755102, "grad_norm": 0.32394155859947205, "kl": 0.04216121416538954, "learning_rate": 3.9538047370114695e-06, "loss": 0.0017, "reward": 11.35419249534607, "reward_std": 2.7961763814091682, "rewards/_soft_format_reward_func": 1.9437499940395355, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.47987498342990875, "rewards/check_answer": 5.930567711591721, "step": 219 }, { "completion_length": 517.3125, "epoch": 1.1224489795918366, "grad_norm": 0.4214778244495392, "kl": 0.07988983625546098, "learning_rate": 3.941700805287169e-06, "loss": 0.0032, "reward": 8.18191134929657, "reward_std": 1.6381587088108063, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.4880624860525131, "rewards/check_answer": 2.768848918378353, "step": 220 }, { "completion_length": 396.625, "epoch": 1.1275510204081634, "grad_norm": 0.29704153537750244, "kl": 0.04481638455763459, "learning_rate": 3.92954602692212e-06, "loss": 0.0018, "reward": 11.367891311645508, "reward_std": 3.2973266541957855, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.29612497985363007, "rewards/check_answer": 6.071766555309296, "step": 221 }, { "completion_length": 589.9375, "epoch": 1.1326530612244898, "grad_norm": 0.19876539707183838, "kl": 0.019044322660192847, "learning_rate": 3.9173408305972606e-06, "loss": 0.0008, "reward": 104.04097664356232, "reward_std": 30.219659864902496, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.49318748712539673, "rewards/check_answer": 98.79778736829758, "step": 222 }, { "completion_length": 368.25, "epoch": 1.1377551020408163, "grad_norm": 0.35037973523139954, "kl": 0.04351036436855793, "learning_rate": 3.905085646771689e-06, "loss": 0.0017, "reward": 19.659468710422516, "reward_std": 5.726649031043053, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": 0.3203124888241291, "rewards/check_answer": 14.83915638923645, "step": 223 }, { "completion_length": 388.0625, "epoch": 1.1428571428571428, "grad_norm": 0.3024386465549469, "kl": 0.036022431682795286, "learning_rate": 3.892780907667495e-06, "loss": 0.0014, "reward": 8.131965398788452, "reward_std": 0.9055671244859695, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 2.630965530872345, "step": 224 }, { "completion_length": 382.25, "epoch": 1.1479591836734695, "grad_norm": 0.3158092796802521, "kl": 0.04636579938232899, "learning_rate": 3.880427047254502e-06, "loss": 0.0019, "reward": 8.08501136302948, "reward_std": 0.7919884920120239, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 2.584011249244213, "step": 225 }, { "completion_length": 581.0, "epoch": 1.153061224489796, "grad_norm": 0.22562365233898163, "kl": 0.03268377063795924, "learning_rate": 3.868024501234972e-06, "loss": 0.0013, "reward": 7.202247619628906, "reward_std": 3.4254866242408752, "rewards/_soft_format_reward_func": 1.8125, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": 0.026124969124794006, "rewards/check_answer": 2.738622672855854, "step": 226 }, { "completion_length": 336.25, "epoch": 1.1581632653061225, "grad_norm": 0.34813469648361206, "kl": 0.04001564159989357, "learning_rate": 3.855573707028239e-06, "loss": 0.0016, "reward": 8.867348909378052, "reward_std": 1.2533812262117863, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 3.366348847746849, "step": 227 }, { "completion_length": 486.375, "epoch": 1.163265306122449, "grad_norm": 0.27957627177238464, "kl": 0.03200881229713559, "learning_rate": 3.843075103755273e-06, "loss": 0.0013, "reward": 10.206645011901855, "reward_std": 2.00884972512722, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.18962496891617775, "rewards/check_answer": 5.017019867897034, "step": 228 }, { "completion_length": 523.25, "epoch": 1.1683673469387754, "grad_norm": 0.7269229888916016, "kl": 0.05936818476766348, "learning_rate": 3.830529132223202e-06, "loss": 0.0024, "reward": 6.077142119407654, "reward_std": 0.9461775571107864, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.18649998307228088, "rewards/check_answer": 1.190642079411191, "step": 229 }, { "completion_length": 485.6875, "epoch": 1.1734693877551021, "grad_norm": 0.2921268343925476, "kl": 0.03041125787422061, "learning_rate": 3.817936234909763e-06, "loss": 0.0012, "reward": 6.2724268436431885, "reward_std": 1.5871776547282934, "rewards/_soft_format_reward_func": 1.918749988079071, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.10399998724460602, "rewards/check_answer": 1.4371768236160278, "step": 230 }, { "completion_length": 516.3125, "epoch": 1.1785714285714286, "grad_norm": 2.598008394241333, "kl": 0.04260433139279485, "learning_rate": 3.80529685594769e-06, "loss": 0.0017, "reward": 12.280081391334534, "reward_std": 2.9517072029411793, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.23993748426437378, "rewards/check_answer": 7.040144145488739, "step": 231 }, { "completion_length": 395.0, "epoch": 1.183673469387755, "grad_norm": 0.35654065012931824, "kl": 0.03489643894135952, "learning_rate": 3.792611441109063e-06, "loss": 0.0014, "reward": 11.359565138816833, "reward_std": 2.3292928487062454, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3761249929666519, "rewards/check_answer": 5.983440205454826, "step": 232 }, { "completion_length": 401.9375, "epoch": 1.1887755102040816, "grad_norm": 0.2584652602672577, "kl": 0.02747915661893785, "learning_rate": 3.779880437789574e-06, "loss": 0.0011, "reward": 8.458192110061646, "reward_std": 0.5072742262855172, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.46968749165534973, "rewards/check_answer": 3.0635048747062683, "step": 233 }, { "completion_length": 447.5, "epoch": 1.193877551020408, "grad_norm": 40.201622009277344, "kl": 2.7213867825921625, "learning_rate": 3.767104294992754e-06, "loss": 0.1089, "reward": 37.41048204898834, "reward_std": 6.400567984208465, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 31.90947988629341, "step": 234 }, { "completion_length": 780.875, "epoch": 1.1989795918367347, "grad_norm": 0.2753540575504303, "kl": 0.021944483974948525, "learning_rate": 3.7542834633141345e-06, "loss": 0.0009, "reward": 7.129895329475403, "reward_std": 1.297385048121214, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": -0.01712501049041748, "rewards/check_answer": 2.1470203548669815, "step": 235 }, { "completion_length": 341.75, "epoch": 1.2040816326530612, "grad_norm": 0.34806281328201294, "kl": 0.0429367832839489, "learning_rate": 3.7414183949253614e-06, "loss": 0.0017, "reward": 6.452441692352295, "reward_std": 2.0902061354136094, "rewards/_soft_format_reward_func": 1.806249976158142, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.18718747794628143, "rewards/check_answer": 1.6465042941272259, "step": 236 }, { "completion_length": 499.5625, "epoch": 1.2091836734693877, "grad_norm": 0.2900775372982025, "kl": 0.039164841175079346, "learning_rate": 3.728509543558239e-06, "loss": 0.0016, "reward": 6.349876523017883, "reward_std": 1.9654560983181, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.029812529683113098, "rewards/check_answer": 1.629688948392868, "step": 237 }, { "completion_length": 440.875, "epoch": 1.2142857142857142, "grad_norm": 1.1023831367492676, "kl": 0.04508004803210497, "learning_rate": 3.715557364488735e-06, "loss": 0.0018, "reward": 7.34362006187439, "reward_std": 0.501777783036232, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 1.8426202535629272, "step": 238 }, { "completion_length": 521.75, "epoch": 1.219387755102041, "grad_norm": 0.18328067660331726, "kl": 0.03002178471069783, "learning_rate": 3.7025623145209196e-06, "loss": 0.0012, "reward": 21.852468729019165, "reward_std": 8.15756268799305, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.33924998715519905, "rewards/check_answer": 16.513219088315964, "step": 239 }, { "completion_length": 423.5625, "epoch": 1.2244897959183674, "grad_norm": 0.27355238795280457, "kl": 0.03441121755167842, "learning_rate": 3.6895248519708552e-06, "loss": 0.0014, "reward": 9.248181223869324, "reward_std": 5.668044149875641, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.08843748271465302, "rewards/check_answer": 4.409743905067444, "step": 240 }, { "completion_length": 289.1875, "epoch": 1.2295918367346939, "grad_norm": 0.43641212582588196, "kl": 0.05587594583630562, "learning_rate": 3.676445436650435e-06, "loss": 0.0022, "reward": 7.713133692741394, "reward_std": 1.4005136042833328, "rewards/_soft_format_reward_func": 1.881250023841858, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.4282499924302101, "rewards/check_answer": 2.5911336839199066, "step": 241 }, { "completion_length": 484.1875, "epoch": 1.2346938775510203, "grad_norm": 0.29936179518699646, "kl": 0.03446671739220619, "learning_rate": 3.6633245298511615e-06, "loss": 0.0014, "reward": 5.435900092124939, "reward_std": 0.9865807015448809, "rewards/_soft_format_reward_func": 1.7124999910593033, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.07593749463558197, "rewards/check_answer": 0.834962572902441, "step": 242 }, { "completion_length": 249.8125, "epoch": 1.239795918367347, "grad_norm": 0.3473406732082367, "kl": 0.041182656306773424, "learning_rate": 3.650162594327881e-06, "loss": 0.0016, "reward": 32.44585049152374, "reward_std": 17.82753943838179, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.38524999003857374, "rewards/check_answer": 27.06060168892145, "step": 243 }, { "completion_length": 373.5625, "epoch": 1.2448979591836735, "grad_norm": 0.6561787724494934, "kl": 0.04421532340347767, "learning_rate": 3.636960094282461e-06, "loss": 0.0018, "reward": 10.547720432281494, "reward_std": 2.3654505601152778, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 5.046720087528229, "step": 244 }, { "completion_length": 312.4375, "epoch": 1.25, "grad_norm": 0.401877224445343, "kl": 0.04252156801521778, "learning_rate": 3.62371749534742e-06, "loss": 0.0017, "reward": 176.37635481357574, "reward_std": 187.44706455618143, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 170.87536078691483, "step": 245 }, { "completion_length": 546.125, "epoch": 1.2551020408163265, "grad_norm": 0.4822182059288025, "kl": 0.0404973141849041, "learning_rate": 3.610435264569506e-06, "loss": 0.0016, "reward": 5.919567108154297, "reward_std": 0.9037803895771503, "rewards/_soft_format_reward_func": 1.7750000059604645, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.40706248953938484, "rewards/check_answer": 0.7375045046210289, "step": 246 }, { "completion_length": 335.0, "epoch": 1.260204081632653, "grad_norm": 1.0417882204055786, "kl": 0.058931102976202965, "learning_rate": 3.59711387039322e-06, "loss": 0.0024, "reward": 7.960751295089722, "reward_std": 1.9481116998940706, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.40937498956918716, "rewards/check_answer": 2.551376521587372, "step": 247 }, { "completion_length": 501.9375, "epoch": 1.2653061224489797, "grad_norm": 0.3014232814311981, "kl": 0.02945040026679635, "learning_rate": 3.5837537826442996e-06, "loss": 0.0012, "reward": 6.0213092267513275, "reward_std": 3.0799474716186523, "rewards/_soft_format_reward_func": 1.5625, "rewards/_strict_format_reward_func": 1.875, "rewards/_xml_count_reward_func": 0.2953749932348728, "rewards/check_answer": 2.2884344458580017, "step": 248 }, { "completion_length": 433.8125, "epoch": 1.2704081632653061, "grad_norm": 0.3146630525588989, "kl": 0.04995664907619357, "learning_rate": 3.570355472513148e-06, "loss": 0.002, "reward": 6.550642013549805, "reward_std": 1.1807164400815964, "rewards/_soft_format_reward_func": 1.918749988079071, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.4178124964237213, "rewards/check_answer": 1.4015794694423676, "step": 249 }, { "completion_length": 399.5625, "epoch": 1.2755102040816326, "grad_norm": 0.7177777290344238, "kl": 0.05952198896557093, "learning_rate": 3.5569194125382122e-06, "loss": 0.0024, "reward": 8.12901496887207, "reward_std": 2.611573375761509, "rewards/_soft_format_reward_func": 1.856249988079071, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": 0.20737500488758087, "rewards/check_answer": 3.440389961004257, "step": 250 }, { "completion_length": 513.25, "epoch": 1.280612244897959, "grad_norm": 0.32417553663253784, "kl": 0.02895202673971653, "learning_rate": 3.543446076589323e-06, "loss": 0.0012, "reward": 11.36902403831482, "reward_std": 1.7574745267629623, "rewards/_soft_format_reward_func": 1.850000023841858, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.21918749064207077, "rewards/check_answer": 6.299836695194244, "step": 251 }, { "completion_length": 356.75, "epoch": 1.2857142857142856, "grad_norm": 0.3084201216697693, "kl": 0.03097515576519072, "learning_rate": 3.529935939850977e-06, "loss": 0.0012, "reward": 9.016826748847961, "reward_std": 0.7636192254722118, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 3.515826642513275, "step": 252 }, { "completion_length": 341.125, "epoch": 1.2908163265306123, "grad_norm": 0.25116291642189026, "kl": 0.02968740649521351, "learning_rate": 3.516389478805581e-06, "loss": 0.0012, "reward": 7.705106616020203, "reward_std": 1.4544169902801514, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3708124943077564, "rewards/check_answer": 2.4092941842973232, "step": 253 }, { "completion_length": 387.8125, "epoch": 1.2959183673469388, "grad_norm": 0.2992796003818512, "kl": 0.044889158103615046, "learning_rate": 3.5028071712166456e-06, "loss": 0.0018, "reward": 7.431437849998474, "reward_std": 1.1658359989523888, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.36274998635053635, "rewards/check_answer": 2.3186877369880676, "step": 254 }, { "completion_length": 463.3125, "epoch": 1.3010204081632653, "grad_norm": 1.0446410179138184, "kl": 0.13644460123032331, "learning_rate": 3.4891894961119367e-06, "loss": 0.0055, "reward": 7.609053730964661, "reward_std": 1.3696298897266388, "rewards/_soft_format_reward_func": 1.981249988079071, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.13462497293949127, "rewards/check_answer": 2.4931787848472595, "step": 255 }, { "completion_length": 313.1875, "epoch": 1.306122448979592, "grad_norm": 1.3470245599746704, "kl": 0.05142315570265055, "learning_rate": 3.4755369337665767e-06, "loss": 0.0021, "reward": 10.27379596233368, "reward_std": 1.2127346321940422, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 4.772795736789703, "step": 256 }, { "completion_length": 479.375, "epoch": 1.3112244897959184, "grad_norm": 0.2921811640262604, "kl": 0.03002007771283388, "learning_rate": 3.4618499656861127e-06, "loss": 0.0012, "reward": 9.079193353652954, "reward_std": 2.8739907946437597, "rewards/_soft_format_reward_func": 1.75, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.46931249648332596, "rewards/check_answer": 4.047380834817886, "step": 257 }, { "completion_length": 487.0625, "epoch": 1.316326530612245, "grad_norm": 0.30052751302719116, "kl": 0.02756687719374895, "learning_rate": 3.448129074589529e-06, "loss": 0.0011, "reward": 8.22206735610962, "reward_std": 1.0395818054676056, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 2.721067249774933, "step": 258 }, { "completion_length": 566.4375, "epoch": 1.3214285714285714, "grad_norm": 1.3980598449707031, "kl": 0.04097011568956077, "learning_rate": 3.4343747443922253e-06, "loss": 0.0016, "reward": 74.7005969285965, "reward_std": 65.19187147170305, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.1017499715089798, "rewards/check_answer": 69.59884896874428, "step": 259 }, { "completion_length": 561.625, "epoch": 1.3265306122448979, "grad_norm": 2.238983392715454, "kl": 0.0385152967646718, "learning_rate": 3.4205874601889465e-06, "loss": 0.0015, "reward": 11.257536888122559, "reward_std": 2.0906684398651123, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.46968749165534973, "rewards/check_answer": 5.862849414348602, "step": 260 }, { "completion_length": 360.25, "epoch": 1.3316326530612246, "grad_norm": 0.4498765468597412, "kl": 0.04135388555005193, "learning_rate": 3.4067677082366795e-06, "loss": 0.0017, "reward": 12.972948431968689, "reward_std": 3.2377343624830246, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.30162498354911804, "rewards/check_answer": 7.6713235676288605, "step": 261 }, { "completion_length": 541.0625, "epoch": 1.336734693877551, "grad_norm": 0.2704751491546631, "kl": 0.036466233897954226, "learning_rate": 3.3929159759374963e-06, "loss": 0.0015, "reward": 9.328784704208374, "reward_std": 2.759569361805916, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.11506250500679016, "rewards/check_answer": 4.28872212767601, "step": 262 }, { "completion_length": 305.4375, "epoch": 1.3418367346938775, "grad_norm": 0.37358084321022034, "kl": 0.06474325619637966, "learning_rate": 3.3790327518213705e-06, "loss": 0.0026, "reward": 18.249707102775574, "reward_std": 10.445075172930956, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.46968749165534973, "rewards/check_answer": 12.85502003878355, "step": 263 }, { "completion_length": 299.375, "epoch": 1.346938775510204, "grad_norm": 0.5274214148521423, "kl": 0.06182891130447388, "learning_rate": 3.3651185255289466e-06, "loss": 0.0025, "reward": 6.380192518234253, "reward_std": 0.7590235061943531, "rewards/_soft_format_reward_func": 1.8875000178813934, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.358062494546175, "rewards/check_answer": 1.1346299946308136, "step": 264 }, { "completion_length": 355.3125, "epoch": 1.3520408163265305, "grad_norm": 1.733555793762207, "kl": 0.05918777082115412, "learning_rate": 3.351173787794265e-06, "loss": 0.0024, "reward": 7.078450918197632, "reward_std": 0.6762270703911781, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.38624998554587364, "rewards/check_answer": 1.6922010779380798, "step": 265 }, { "completion_length": 668.6875, "epoch": 1.3571428571428572, "grad_norm": 0.29560568928718567, "kl": 0.033541878685355186, "learning_rate": 3.3371990304274654e-06, "loss": 0.0013, "reward": 5.593075513839722, "reward_std": 2.103371325880289, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": -0.19037500023841858, "rewards/check_answer": 1.2834507524967194, "step": 266 }, { "completion_length": 334.5, "epoch": 1.3622448979591837, "grad_norm": 0.32789790630340576, "kl": 0.043092924170196056, "learning_rate": 3.3231947462974314e-06, "loss": 0.0017, "reward": 9.766201496124268, "reward_std": 1.6006742417812347, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3867499902844429, "rewards/check_answer": 4.379451096057892, "step": 267 }, { "completion_length": 331.9375, "epoch": 1.3673469387755102, "grad_norm": 0.3861054480075836, "kl": 0.04949623066931963, "learning_rate": 3.3091614293144103e-06, "loss": 0.002, "reward": 6.758515477180481, "reward_std": 0.5750819966197014, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 1.2575156837701797, "step": 268 }, { "completion_length": 466.4375, "epoch": 1.3724489795918369, "grad_norm": 0.4356038570404053, "kl": 0.02766430750489235, "learning_rate": 3.2950995744125986e-06, "loss": 0.0011, "reward": 7.627132177352905, "reward_std": 1.2347041498869658, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.46968749165534973, "rewards/check_answer": 2.232444867491722, "step": 269 }, { "completion_length": 342.1875, "epoch": 1.3775510204081631, "grad_norm": 76.3337173461914, "kl": 0.06372056156396866, "learning_rate": 3.2810096775326807e-06, "loss": 0.0025, "reward": 6.860260605812073, "reward_std": 0.8331015557050705, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.46968749165534973, "rewards/check_answer": 1.4655731916427612, "step": 270 }, { "completion_length": 435.875, "epoch": 1.3826530612244898, "grad_norm": 0.40490594506263733, "kl": 0.03950034361332655, "learning_rate": 3.2668922356043393e-06, "loss": 0.0016, "reward": 17.05094587802887, "reward_std": 2.617586500942707, "rewards/_soft_format_reward_func": 1.9437499940395355, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.10237498581409454, "rewards/check_answer": 12.004821479320526, "step": 271 }, { "completion_length": 505.5625, "epoch": 1.3877551020408163, "grad_norm": 0.31562700867652893, "kl": 0.03630805341526866, "learning_rate": 3.2527477465287315e-06, "loss": 0.0015, "reward": 8.585829615592957, "reward_std": 2.7087118178606033, "rewards/_soft_format_reward_func": 1.65625, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.37574999034404755, "rewards/check_answer": 3.7413295432925224, "step": 272 }, { "completion_length": 390.75, "epoch": 1.3928571428571428, "grad_norm": 0.39003291726112366, "kl": 0.036192905623465776, "learning_rate": 3.2385767091609256e-06, "loss": 0.0014, "reward": 15.147390484809875, "reward_std": 2.444808963686228, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 9.646390154957771, "step": 273 }, { "completion_length": 343.625, "epoch": 1.3979591836734695, "grad_norm": 0.4384270906448364, "kl": 0.04466715827584267, "learning_rate": 3.2243796232923097e-06, "loss": 0.0018, "reward": 31.85398769378662, "reward_std": 11.348299875855446, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 26.352985858917236, "step": 274 }, { "completion_length": 324.9375, "epoch": 1.403061224489796, "grad_norm": 0.2867165207862854, "kl": 0.03754226490855217, "learning_rate": 3.210156989632963e-06, "loss": 0.0015, "reward": 7.19976270198822, "reward_std": 0.6154328808188438, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 1.6987627260386944, "step": 275 }, { "completion_length": 354.8125, "epoch": 1.4081632653061225, "grad_norm": 0.29476699233055115, "kl": 0.03253966011106968, "learning_rate": 3.1959093097939985e-06, "loss": 0.0013, "reward": 7.976716876029968, "reward_std": 0.40102337673306465, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 2.4757168292999268, "step": 276 }, { "completion_length": 351.0, "epoch": 1.413265306122449, "grad_norm": 0.312061607837677, "kl": 0.0403234614059329, "learning_rate": 3.1816370862698687e-06, "loss": 0.0016, "reward": 6.9380176067352295, "reward_std": 0.2944545615464449, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 1.4370175302028656, "step": 277 }, { "completion_length": 363.75, "epoch": 1.4183673469387754, "grad_norm": 1.5660181045532227, "kl": 0.05030357465147972, "learning_rate": 3.167340822420646e-06, "loss": 0.002, "reward": 9.957616448402405, "reward_std": 1.670918844640255, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.3322499990463257, "rewards/check_answer": 4.625366747379303, "step": 278 }, { "completion_length": 823.5, "epoch": 1.4234693877551021, "grad_norm": 0.3058522641658783, "kl": 0.032453726103994995, "learning_rate": 3.15302102245427e-06, "loss": 0.0013, "reward": 5.786714851856232, "reward_std": 1.322007343173027, "rewards/_soft_format_reward_func": 1.5750000029802322, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": 0.04399999603629112, "rewards/check_answer": 1.542714830377463, "step": 279 }, { "completion_length": 512.625, "epoch": 1.4285714285714286, "grad_norm": 1.387960433959961, "kl": 0.1971198613755405, "learning_rate": 3.1386781914087644e-06, "loss": 0.0079, "reward": 7.7154969573020935, "reward_std": 1.7870135828852654, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": 0.049124978482723236, "rewards/check_answer": 3.166372127830982, "step": 280 }, { "completion_length": 399.75, "epoch": 1.433673469387755, "grad_norm": 0.25350087881088257, "kl": 0.05288520269095898, "learning_rate": 3.124312835134423e-06, "loss": 0.0021, "reward": 5.986119747161865, "reward_std": 1.6316592246294022, "rewards/_soft_format_reward_func": 1.4500000029802322, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.18143747746944427, "rewards/check_answer": 1.5421821877826005, "step": 281 }, { "completion_length": 341.4375, "epoch": 1.4387755102040816, "grad_norm": 0.2632606625556946, "kl": 0.040725668892264366, "learning_rate": 3.109925460275972e-06, "loss": 0.0016, "reward": 11.454934000968933, "reward_std": 3.1375857144594193, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 5.95393431186676, "step": 282 }, { "completion_length": 372.5, "epoch": 1.443877551020408, "grad_norm": 0.41861191391944885, "kl": 0.03541993070393801, "learning_rate": 3.095516574254701e-06, "loss": 0.0014, "reward": 11.167982816696167, "reward_std": 3.0760795176029205, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 5.666983067989349, "step": 283 }, { "completion_length": 419.0, "epoch": 1.4489795918367347, "grad_norm": 0.3124421238899231, "kl": 0.032168209087103605, "learning_rate": 3.081086685250565e-06, "loss": 0.0013, "reward": 7.30214262008667, "reward_std": 2.5055956970900297, "rewards/_soft_format_reward_func": 1.862500011920929, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.08562497794628143, "rewards/check_answer": 2.5415174663066864, "step": 284 }, { "completion_length": 451.3125, "epoch": 1.4540816326530612, "grad_norm": 0.5122058987617493, "kl": 0.04522203654050827, "learning_rate": 3.0666363021842637e-06, "loss": 0.0018, "reward": 9.730563640594482, "reward_std": 2.6023759096860886, "rewards/_soft_format_reward_func": 1.925000011920929, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.46187499165534973, "rewards/check_answer": 4.343688324093819, "step": 285 }, { "completion_length": 432.5, "epoch": 1.4591836734693877, "grad_norm": 0.26502475142478943, "kl": 0.03718484891578555, "learning_rate": 3.0521659346992914e-06, "loss": 0.0015, "reward": 10.602559328079224, "reward_std": 3.779228514060378, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.38899998739361763, "rewards/check_answer": 5.213559329509735, "step": 286 }, { "completion_length": 514.875, "epoch": 1.4642857142857144, "grad_norm": 0.5324726104736328, "kl": 0.02997216023504734, "learning_rate": 3.0376760931439636e-06, "loss": 0.0012, "reward": 6.663404941558838, "reward_std": 0.3998007522895932, "rewards/_soft_format_reward_func": 1.962499976158142, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.48374999314546585, "rewards/check_answer": 1.2171547338366508, "step": 287 }, { "completion_length": 363.8125, "epoch": 1.469387755102041, "grad_norm": 0.8685181140899658, "kl": 0.05071080569177866, "learning_rate": 3.0231672885534162e-06, "loss": 0.002, "reward": 10.159613728523254, "reward_std": 3.263587534427643, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.24118748307228088, "rewards/check_answer": 5.168426126241684, "step": 288 }, { "completion_length": 505.5625, "epoch": 1.4744897959183674, "grad_norm": 0.4313875436782837, "kl": 0.023334636818617582, "learning_rate": 3.0086400326315853e-06, "loss": 0.0009, "reward": 13.465148687362671, "reward_std": 3.8368625193834305, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.19637498259544373, "rewards/check_answer": 8.518772959709167, "step": 289 }, { "completion_length": 363.0, "epoch": 1.4795918367346939, "grad_norm": 1.4757329225540161, "kl": 0.056132697500288486, "learning_rate": 2.9940948377331545e-06, "loss": 0.0022, "reward": 8.671097755432129, "reward_std": 1.176779517903924, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 3.170098103582859, "step": 290 }, { "completion_length": 261.75, "epoch": 1.4846938775510203, "grad_norm": 0.3565198481082916, "kl": 0.0635771295055747, "learning_rate": 2.9795322168454913e-06, "loss": 0.0025, "reward": 8.113892078399658, "reward_std": 1.454827919602394, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.29224999621510506, "rewards/check_answer": 3.071641981601715, "step": 291 }, { "completion_length": 386.5, "epoch": 1.489795918367347, "grad_norm": 231.2912139892578, "kl": 6.115159600973129, "learning_rate": 2.964952683570552e-06, "loss": 0.2446, "reward": 14.47762405872345, "reward_std": 5.436617307364941, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": 0.16768748685717583, "rewards/check_answer": 9.559936925768852, "step": 292 }, { "completion_length": 577.9375, "epoch": 1.4948979591836735, "grad_norm": 0.21812781691551208, "kl": 0.019935126649215817, "learning_rate": 2.950356752106766e-06, "loss": 0.0008, "reward": 8.718894958496094, "reward_std": 1.5298770144581795, "rewards/_soft_format_reward_func": 1.7000000029802322, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.37574999034404755, "rewards/check_answer": 3.643144518136978, "step": 293 }, { "completion_length": 475.1875, "epoch": 1.5, "grad_norm": 0.3457328975200653, "kl": 0.02667845878750086, "learning_rate": 2.935744937230903e-06, "loss": 0.0011, "reward": 7.602374076843262, "reward_std": 0.3219727333635092, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 2.101373940706253, "step": 294 }, { "completion_length": 485.625, "epoch": 1.5051020408163265, "grad_norm": 14.412628173828125, "kl": 0.07801831932738423, "learning_rate": 2.921117754279917e-06, "loss": 0.0031, "reward": 6.41039764881134, "reward_std": 1.3181462110951543, "rewards/_soft_format_reward_func": 1.875, "rewards/_strict_format_reward_func": 2.625, "rewards/_xml_count_reward_func": 0.136687483638525, "rewards/check_answer": 1.7737102061510086, "step": 295 }, { "completion_length": 680.3125, "epoch": 1.510204081632653, "grad_norm": 0.3333602845668793, "kl": 0.021263310685753822, "learning_rate": 2.906475719132771e-06, "loss": 0.0009, "reward": 60.51475948095322, "reward_std": 46.97568482183851, "rewards/_soft_format_reward_func": 1.5062499940395355, "rewards/_strict_format_reward_func": 2.25, "rewards/_xml_count_reward_func": 0.013374999165534973, "rewards/check_answer": 56.74513205885887, "step": 296 }, { "completion_length": 331.3125, "epoch": 1.5153061224489797, "grad_norm": 0.39851707220077515, "kl": 0.03025135211646557, "learning_rate": 2.891819348192243e-06, "loss": 0.0012, "reward": 5.532482147216797, "reward_std": 1.5071395635604858, "rewards/_soft_format_reward_func": 1.9375, "rewards/_strict_format_reward_func": 2.8125, "rewards/_xml_count_reward_func": -0.23243750259280205, "rewards/check_answer": 1.0149196833372116, "step": 297 }, { "completion_length": 351.4375, "epoch": 1.5204081632653061, "grad_norm": 0.6662870049476624, "kl": 0.05197958368808031, "learning_rate": 2.8771491583667134e-06, "loss": 0.0021, "reward": 8.108211636543274, "reward_std": 1.016579732298851, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.28837499022483826, "rewards/check_answer": 2.8198367804288864, "step": 298 }, { "completion_length": 351.9375, "epoch": 1.5255102040816326, "grad_norm": 0.5090453028678894, "kl": 0.05331577826291323, "learning_rate": 2.8624656670519335e-06, "loss": 0.0021, "reward": 10.280625343322754, "reward_std": 2.0579520761966705, "rewards/_soft_format_reward_func": 2.0, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.5009999871253967, "rewards/check_answer": 4.7796255350112915, "step": 299 }, { "completion_length": 292.5, "epoch": 1.5306122448979593, "grad_norm": 0.2967124879360199, "kl": 0.06189478933811188, "learning_rate": 2.847769392112779e-06, "loss": 0.0025, "reward": 11.612756848335266, "reward_std": 1.1044548898935318, "rewards/_soft_format_reward_func": 1.9625000059604645, "rewards/_strict_format_reward_func": 3.0, "rewards/_xml_count_reward_func": 0.48531249165534973, "rewards/check_answer": 6.164944142103195, "step": 300 } ], "logging_steps": 1, "max_steps": 588, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }