{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987438399845395, "eval_steps": 100, "global_step": 646, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 928.2598609924316, "epoch": 0.015460430959512996, "grad_norm": 0.005825439665555334, "kl": 0.0004873394966125488, "learning_rate": 3.0769230769230774e-06, "loss": 0.0, "reward": 0.6422783114481717, "reward_std": 0.6085492318496108, "rewards/accuracy_reward": 0.16450893601868302, "rewards/cosine_scaled_reward": -0.15619643366662786, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.6339658062905074, "step": 10 }, { "completion_length": 834.5876502990723, "epoch": 0.03092086191902599, "grad_norm": 0.005061101750122049, "kl": 0.005776357650756836, "learning_rate": 6.153846153846155e-06, "loss": 0.0002, "reward": 1.1410260727629065, "reward_std": 0.6005165675655008, "rewards/accuracy_reward": 0.3330357288941741, "rewards/cosine_scaled_reward": 0.021941200397304784, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.786049148067832, "step": 20 }, { "completion_length": 794.6511512756348, "epoch": 0.04638129287853899, "grad_norm": 0.004186908324406587, "kl": 0.012411689758300782, "learning_rate": 9.230769230769232e-06, "loss": 0.0005, "reward": 1.5154079463332892, "reward_std": 0.5513344288803637, "rewards/accuracy_reward": 0.4193080538418144, "rewards/cosine_scaled_reward": 0.14040787946141792, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9556920122355222, "step": 30 }, { "completion_length": 823.3263763427734, "epoch": 0.06184172383805198, "grad_norm": 0.004675533239892946, "kl": 0.0160797119140625, "learning_rate": 1.230769230769231e-05, "loss": 0.0006, "reward": 1.6104407742619515, "reward_std": 0.5197742725256831, "rewards/accuracy_reward": 0.45156252263113855, "rewards/cosine_scaled_reward": 0.18000916420933208, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9788690943270921, "step": 40 }, { "completion_length": 824.9076248168946, "epoch": 0.07730215479756498, "grad_norm": 0.008209128879452155, "kl": 0.021560287475585936, "learning_rate": 1.5384615384615387e-05, "loss": 0.0009, "reward": 1.7187657799571752, "reward_std": 0.539798857551068, "rewards/accuracy_reward": 0.48928573532029984, "rewards/cosine_scaled_reward": 0.2462582775799092, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9832217764109373, "step": 50 }, { "completion_length": 776.4894325256348, "epoch": 0.09276258575707798, "grad_norm": 0.005032104484078714, "kl": 0.03041839599609375, "learning_rate": 1.8461538461538465e-05, "loss": 0.0012, "reward": 1.7944712869822979, "reward_std": 0.5402565439231694, "rewards/accuracy_reward": 0.5172991305589676, "rewards/cosine_scaled_reward": 0.2977078223892022, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.979464340955019, "step": 60 }, { "completion_length": 783.9421092987061, "epoch": 0.10822301671659097, "grad_norm": 0.046440537921451495, "kl": 0.18165512084960939, "learning_rate": 1.999634547413886e-05, "loss": 0.0073, "reward": 1.5861421424895525, "reward_std": 0.7133004866540432, "rewards/accuracy_reward": 0.46037948597222567, "rewards/cosine_scaled_reward": 0.2454426669143686, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8803199872374534, "step": 70 }, { "completion_length": 704.7571739196777, "epoch": 0.12368344767610397, "grad_norm": 0.005638881154684646, "kl": 0.14808197021484376, "learning_rate": 1.9967125291968495e-05, "loss": 0.0059, "reward": 1.770343079417944, "reward_std": 0.6161688735242933, "rewards/accuracy_reward": 0.5064732374623417, "rewards/cosine_scaled_reward": 0.31033555960966624, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9535342697054148, "step": 80 }, { "completion_length": 811.8497016906738, "epoch": 0.13914387863561697, "grad_norm": 0.006277685603677439, "kl": 0.1677825927734375, "learning_rate": 1.990877034074683e-05, "loss": 0.0067, "reward": 1.7127626728266478, "reward_std": 0.5350183860398829, "rewards/accuracy_reward": 0.4822544841095805, "rewards/cosine_scaled_reward": 0.24494265783869196, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9855655211955309, "step": 90 }, { "completion_length": 786.0161056518555, "epoch": 0.15460430959512997, "grad_norm": 0.004638658867653336, "kl": 0.20247802734375, "learning_rate": 1.9821451197042028e-05, "loss": 0.0081, "reward": 1.7241055637598037, "reward_std": 0.6285460269078612, "rewards/accuracy_reward": 0.4876116293948144, "rewards/cosine_scaled_reward": 0.27760252499065247, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9588914208114148, "step": 100 }, { "epoch": 0.15460430959512997, "eval_completion_length": 785.1983642578125, "eval_kl": 0.1015625, "eval_loss": 0.004128854256123304, "eval_reward": 1.8587820827960968, "eval_reward_std": 0.4679965078830719, "eval_rewards/accuracy_reward": 0.5345982536673546, "eval_rewards/cosine_scaled_reward": 0.33646056056022644, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9877232909202576, "eval_runtime": 65.77, "eval_samples_per_second": 1.505, "eval_steps_per_second": 0.015, "step": 100 }, { "completion_length": 795.4053916931152, "epoch": 0.17006474055464296, "grad_norm": 0.0050662218091816385, "kl": 0.20187530517578126, "learning_rate": 1.9705423102261324e-05, "loss": 0.0081, "reward": 1.7285974282771348, "reward_std": 0.6808601895347237, "rewards/accuracy_reward": 0.49296877197921274, "rewards/cosine_scaled_reward": 0.296454501109838, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9391741570085287, "step": 110 }, { "completion_length": 791.2351921081543, "epoch": 0.18552517151415596, "grad_norm": 0.005966417830813838, "kl": 0.222845458984375, "learning_rate": 1.956102521655831e-05, "loss": 0.0089, "reward": 1.7569476522505283, "reward_std": 0.6371290137991309, "rewards/accuracy_reward": 0.4906250220956281, "rewards/cosine_scaled_reward": 0.30925412904762195, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9570684999227523, "step": 120 }, { "completion_length": 778.9137634277344, "epoch": 0.20098560247366895, "grad_norm": 0.11425551975396886, "kl": 0.455560302734375, "learning_rate": 1.9388679627438486e-05, "loss": 0.0182, "reward": 1.6175578892230988, "reward_std": 0.6627502014860511, "rewards/accuracy_reward": 0.43537948445882646, "rewards/cosine_scaled_reward": 0.24683610293641323, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9353423073887825, "step": 130 }, { "completion_length": 671.9368595123291, "epoch": 0.21644603343318194, "grad_norm": 0.012687310552666826, "kl": 2.4836822509765626, "learning_rate": 1.9188890115960967e-05, "loss": 0.0994, "reward": 1.4270036322064699, "reward_std": 0.739827654324472, "rewards/accuracy_reward": 0.39709823183948173, "rewards/cosine_scaled_reward": 0.2161776867986191, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8137277197092772, "step": 140 }, { "completion_length": 703.5272651672364, "epoch": 0.23190646439269494, "grad_norm": 0.023072548858575986, "kl": 0.175665283203125, "learning_rate": 1.8962240684142923e-05, "loss": 0.007, "reward": 1.8304371915757656, "reward_std": 0.5595470611006021, "rewards/accuracy_reward": 0.5032366321422159, "rewards/cosine_scaled_reward": 0.3409654124639928, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.986235162243247, "step": 150 }, { "completion_length": 755.7974658966065, "epoch": 0.24736689535220793, "grad_norm": 0.0048724703817881404, "kl": 0.1621673583984375, "learning_rate": 1.8709393847871146e-05, "loss": 0.0065, "reward": 1.8066862165927886, "reward_std": 0.541509800683707, "rewards/accuracy_reward": 0.4974330588709563, "rewards/cosine_scaled_reward": 0.32353881540329893, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9857143286615611, "step": 160 }, { "completion_length": 767.8857475280762, "epoch": 0.26282732631172095, "grad_norm": 0.0058295760602797165, "kl": 0.1024566650390625, "learning_rate": 1.8431088700310846e-05, "loss": 0.0041, "reward": 1.8246684893965721, "reward_std": 0.6182428574189544, "rewards/accuracy_reward": 0.5167410940863192, "rewards/cosine_scaled_reward": 0.33218330084491754, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9757441036403179, "step": 170 }, { "completion_length": 782.8031581878662, "epoch": 0.27828775727123395, "grad_norm": 0.007260482392875092, "kl": 0.133380126953125, "learning_rate": 1.8128138751472432e-05, "loss": 0.0053, "reward": 1.6873359650373458, "reward_std": 0.7230455877259374, "rewards/accuracy_reward": 0.46573662804439664, "rewards/cosine_scaled_reward": 0.2658701150892739, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.955729215592146, "step": 180 }, { "completion_length": 770.7354141235352, "epoch": 0.29374818823074694, "grad_norm": 0.0038766706896374765, "kl": 0.084027099609375, "learning_rate": 1.780142955025139e-05, "loss": 0.0034, "reward": 1.8208528086543083, "reward_std": 0.6158834310248494, "rewards/accuracy_reward": 0.5102678800933063, "rewards/cosine_scaled_reward": 0.3412768360443579, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.969308077916503, "step": 190 }, { "completion_length": 777.9120876312256, "epoch": 0.30920861919025994, "grad_norm": 0.004081871056913627, "kl": 0.079278564453125, "learning_rate": 1.745191609589231e-05, "loss": 0.0032, "reward": 1.8799906723201274, "reward_std": 0.6350350034423172, "rewards/accuracy_reward": 0.5420759165659547, "rewards/cosine_scaled_reward": 0.36834625932970083, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9695684995502234, "step": 200 }, { "epoch": 0.30920861919025994, "eval_completion_length": 786.5066223144531, "eval_kl": 0.080078125, "eval_loss": 0.0031854985281825066, "eval_reward": 1.7613219320774078, "eval_reward_std": 0.6763340681791306, "eval_rewards/accuracy_reward": 0.4933036044239998, "eval_rewards/cosine_scaled_reward": 0.3011283501982689, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9668899178504944, "eval_runtime": 67.4733, "eval_samples_per_second": 1.467, "eval_steps_per_second": 0.015, "step": 200 }, { "completion_length": 762.9672210693359, "epoch": 0.32466905014977293, "grad_norm": 0.0045327243820408964, "kl": 0.0857818603515625, "learning_rate": 1.7080620046443503e-05, "loss": 0.0034, "reward": 1.8360209584236145, "reward_std": 0.6304899661801755, "rewards/accuracy_reward": 0.5189732388593257, "rewards/cosine_scaled_reward": 0.3513110678992234, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9657366566359997, "step": 210 }, { "completion_length": 740.6268199920654, "epoch": 0.3401294811092859, "grad_norm": 0.40798247676236865, "kl": 0.09603729248046874, "learning_rate": 1.6688626732362192e-05, "loss": 0.0038, "reward": 1.8989367991685868, "reward_std": 0.6170632224529982, "rewards/accuracy_reward": 0.541183059476316, "rewards/cosine_scaled_reward": 0.3866971510913572, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9710565954446793, "step": 220 }, { "completion_length": 745.6220226287842, "epoch": 0.3555899120687989, "grad_norm": 0.009310955589968223, "kl": 0.17754974365234374, "learning_rate": 1.6277081983999742e-05, "loss": 0.0071, "reward": 1.9535415962338447, "reward_std": 0.5657559703569859, "rewards/accuracy_reward": 0.5494419884867966, "rewards/cosine_scaled_reward": 0.4263093855464831, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9777902279049158, "step": 230 }, { "completion_length": 754.8473545074463, "epoch": 0.3710503430283119, "grad_norm": 0.009032184745149096, "kl": 0.1623504638671875, "learning_rate": 1.5847188782240473e-05, "loss": 0.0065, "reward": 1.8752706520259381, "reward_std": 0.6476909777149558, "rewards/accuracy_reward": 0.5162946661002934, "rewards/cosine_scaled_reward": 0.3971455840044655, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9618303928524256, "step": 240 }, { "completion_length": 767.3316184997559, "epoch": 0.3865107739878249, "grad_norm": 0.006074054783900294, "kl": 0.1158416748046875, "learning_rate": 1.5400203742084508e-05, "loss": 0.0046, "reward": 1.8485381975769997, "reward_std": 0.6796474339440465, "rewards/accuracy_reward": 0.5156250222586095, "rewards/cosine_scaled_reward": 0.3913580739754252, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9415550928562879, "step": 250 }, { "completion_length": 740.9466835021973, "epoch": 0.4019712049473379, "grad_norm": 0.004612552363152663, "kl": 0.10526580810546875, "learning_rate": 1.4937433439453465e-05, "loss": 0.0042, "reward": 1.834777297079563, "reward_std": 0.694879194535315, "rewards/accuracy_reward": 0.5040178820490837, "rewards/cosine_scaled_reward": 0.38555847499519585, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9452009297907352, "step": 260 }, { "completion_length": 769.4490287780761, "epoch": 0.4174316359068509, "grad_norm": 0.005166739754892184, "kl": 0.122613525390625, "learning_rate": 1.4460230591956097e-05, "loss": 0.0049, "reward": 1.8051817450672387, "reward_std": 0.7667457018047571, "rewards/accuracy_reward": 0.5031250216066837, "rewards/cosine_scaled_reward": 0.3666772120282985, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9353795044124127, "step": 270 }, { "completion_length": 756.1934505462647, "epoch": 0.4328920668663639, "grad_norm": 0.004779328317174938, "kl": 0.118280029296875, "learning_rate": 1.3969990104777712e-05, "loss": 0.0047, "reward": 1.835938386246562, "reward_std": 0.6989197930321097, "rewards/accuracy_reward": 0.5044643082190305, "rewards/cosine_scaled_reward": 0.3808415879495442, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9506324753165245, "step": 280 }, { "completion_length": 753.9099658966064, "epoch": 0.4483524978258769, "grad_norm": 0.006205432336241612, "kl": 0.12601318359375, "learning_rate": 1.3468144993251735e-05, "loss": 0.005, "reward": 1.8052862711250781, "reward_std": 0.6413127107545733, "rewards/accuracy_reward": 0.47890627244487405, "rewards/cosine_scaled_reward": 0.3571091307036113, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9692708767950535, "step": 290 }, { "completion_length": 766.8500350952148, "epoch": 0.4638129287853899, "grad_norm": 0.005053729003460748, "kl": 0.1371002197265625, "learning_rate": 1.295616219403197e-05, "loss": 0.0055, "reward": 1.7713046602904796, "reward_std": 0.6539058156311512, "rewards/accuracy_reward": 0.4574776992201805, "rewards/cosine_scaled_reward": 0.34656501180725174, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.967261953279376, "step": 300 }, { "epoch": 0.4638129287853899, "eval_completion_length": 725.3372497558594, "eval_kl": 0.125732421875, "eval_loss": 0.005167535971850157, "eval_reward": 1.8545046150684357, "eval_reward_std": 0.5993074476718903, "eval_rewards/accuracy_reward": 0.4888393133878708, "eval_rewards/cosine_scaled_reward": 0.3980313614010811, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9676340073347092, "eval_runtime": 63.1453, "eval_samples_per_second": 1.568, "eval_steps_per_second": 0.016, "step": 300 }, { "completion_length": 738.0375347137451, "epoch": 0.47927335974490287, "grad_norm": 0.004708932814585316, "kl": 0.128253173828125, "learning_rate": 1.2435538277109919e-05, "loss": 0.0051, "reward": 1.776976404339075, "reward_std": 0.6543458372354507, "rewards/accuracy_reward": 0.4662946649361402, "rewards/cosine_scaled_reward": 0.35774270847914524, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9529390316456556, "step": 310 }, { "completion_length": 730.0644290924072, "epoch": 0.49473379070441587, "grad_norm": 0.006404744910772637, "kl": 0.12236328125, "learning_rate": 1.19077950712113e-05, "loss": 0.0049, "reward": 1.8439508713781834, "reward_std": 0.6846362385898829, "rewards/accuracy_reward": 0.500669667404145, "rewards/cosine_scaled_reward": 0.3922022982500494, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9510789047926664, "step": 320 }, { "completion_length": 733.6488037109375, "epoch": 0.5101942216639289, "grad_norm": 0.005310241047036926, "kl": 0.1285675048828125, "learning_rate": 1.137447521535908e-05, "loss": 0.0051, "reward": 1.8017703101038933, "reward_std": 0.670677787438035, "rewards/accuracy_reward": 0.46941966488957404, "rewards/cosine_scaled_reward": 0.3702226262510521, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9621280215680599, "step": 330 }, { "completion_length": 740.5896522521973, "epoch": 0.5256546526234419, "grad_norm": 0.004911848589025536, "kl": 0.125958251953125, "learning_rate": 1.0837137649606241e-05, "loss": 0.005, "reward": 1.8196691133081913, "reward_std": 0.6627934613265097, "rewards/accuracy_reward": 0.4854910961352289, "rewards/cosine_scaled_reward": 0.37692351534496993, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.957254507765174, "step": 340 }, { "completion_length": 733.2659954071045, "epoch": 0.5411150835829549, "grad_norm": 0.009426685166535624, "kl": 0.1310546875, "learning_rate": 1.0297353058119209e-05, "loss": 0.0052, "reward": 1.7875644348561763, "reward_std": 0.6663354218006134, "rewards/accuracy_reward": 0.46261162832379343, "rewards/cosine_scaled_reward": 0.36353164007887245, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.961421174928546, "step": 350 }, { "completion_length": 755.5462394714356, "epoch": 0.5565755145424679, "grad_norm": 0.005204829040206616, "kl": 0.14141845703125, "learning_rate": 9.756699277932196e-06, "loss": 0.0057, "reward": 1.7464446134865284, "reward_std": 0.6827127303928137, "rewards/accuracy_reward": 0.43928573140874505, "rewards/cosine_scaled_reward": 0.3423150799470022, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9648437988013029, "step": 360 }, { "completion_length": 738.9675567626953, "epoch": 0.5720359455019809, "grad_norm": 0.0050950433186417, "kl": 0.133477783203125, "learning_rate": 9.216756686793163e-06, "loss": 0.0053, "reward": 1.7593348406255245, "reward_std": 0.7046971999108791, "rewards/accuracy_reward": 0.4560268087312579, "rewards/cosine_scaled_reward": 0.35353119419887663, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9497768227010965, "step": 370 }, { "completion_length": 715.9590724945068, "epoch": 0.5874963764614939, "grad_norm": 0.005868130396446593, "kl": 0.1201171875, "learning_rate": 8.67910358358298e-06, "loss": 0.0048, "reward": 1.8290306769311429, "reward_std": 0.7089241919107735, "rewards/accuracy_reward": 0.4906250239349902, "rewards/cosine_scaled_reward": 0.3883312027202919, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9500744428485632, "step": 380 }, { "completion_length": 758.1067291259766, "epoch": 0.6029568074210069, "grad_norm": 0.005528799006616127, "kl": 0.1315093994140625, "learning_rate": 8.145311574811325e-06, "loss": 0.0053, "reward": 1.6966661393642426, "reward_std": 0.7609130211174489, "rewards/accuracy_reward": 0.45424109399318696, "rewards/cosine_scaled_reward": 0.32028957750299014, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9221354588866234, "step": 390 }, { "completion_length": 731.6211277008057, "epoch": 0.6184172383805199, "grad_norm": 0.006163761009715641, "kl": 0.130072021484375, "learning_rate": 7.616940980675004e-06, "loss": 0.0052, "reward": 1.7418419629335404, "reward_std": 0.7564100152812898, "rewards/accuracy_reward": 0.46339287841692567, "rewards/cosine_scaled_reward": 0.34221391292085174, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9362351588904858, "step": 400 }, { "epoch": 0.6184172383805199, "eval_completion_length": 721.2921142578125, "eval_kl": 0.14404296875, "eval_loss": 0.005833905190229416, "eval_reward": 1.8010995388031006, "eval_reward_std": 0.79125015437603, "eval_rewards/accuracy_reward": 0.4899553880095482, "eval_rewards/cosine_scaled_reward": 0.3773643299937248, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9337798058986664, "eval_runtime": 64.0844, "eval_samples_per_second": 1.545, "eval_steps_per_second": 0.016, "step": 400 }, { "completion_length": 730.9440063476562, "epoch": 0.6338776693400329, "grad_norm": 0.007004458345967938, "kl": 0.1326690673828125, "learning_rate": 7.095536274107046e-06, "loss": 0.0053, "reward": 1.7348041359335185, "reward_std": 0.7573289098218083, "rewards/accuracy_reward": 0.46227680711308494, "rewards/cosine_scaled_reward": 0.3395287758205086, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.93299855068326, "step": 410 }, { "completion_length": 737.4034927368164, "epoch": 0.6493381002995459, "grad_norm": 0.006302023763263314, "kl": 0.1422760009765625, "learning_rate": 6.58262156614881e-06, "loss": 0.0057, "reward": 1.7033680249005556, "reward_std": 0.7371486462652683, "rewards/accuracy_reward": 0.43750002002343535, "rewards/cosine_scaled_reward": 0.32375487285316923, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.942113135010004, "step": 420 }, { "completion_length": 756.3276016235352, "epoch": 0.6647985312590589, "grad_norm": 0.008166583966853302, "kl": 0.149725341796875, "learning_rate": 6.079696150841634e-06, "loss": 0.006, "reward": 1.6697823703289032, "reward_std": 0.7648335263133049, "rewards/accuracy_reward": 0.4290178781375289, "rewards/cosine_scaled_reward": 0.30843557265470734, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9323289088904858, "step": 430 }, { "completion_length": 711.6224662780762, "epoch": 0.6802589622185718, "grad_norm": 0.006101275201994206, "kl": 0.149908447265625, "learning_rate": 5.588230122660672e-06, "loss": 0.006, "reward": 1.710378536581993, "reward_std": 0.7376122187823058, "rewards/accuracy_reward": 0.43995537869632245, "rewards/cosine_scaled_reward": 0.3315466307423776, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9388765264302492, "step": 440 }, { "completion_length": 720.1637599945068, "epoch": 0.6957193931780848, "grad_norm": 0.00827738551813243, "kl": 0.1536865234375, "learning_rate": 5.109660079301668e-06, "loss": 0.0061, "reward": 1.7479658477008342, "reward_std": 0.7545963631942868, "rewards/accuracy_reward": 0.45546877263113855, "rewards/cosine_scaled_reward": 0.3493794774003618, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9431175928562879, "step": 450 }, { "completion_length": 719.8788265228271, "epoch": 0.7111798241375978, "grad_norm": 0.009020213190404006, "kl": 0.146099853515625, "learning_rate": 4.64538492238166e-06, "loss": 0.0058, "reward": 1.761041846126318, "reward_std": 0.7622619468718768, "rewards/accuracy_reward": 0.46506698690354825, "rewards/cosine_scaled_reward": 0.3563170699868351, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9396577756851912, "step": 460 }, { "completion_length": 715.616215133667, "epoch": 0.7266402550971108, "grad_norm": 0.009535640148387967, "kl": 0.1488037109375, "learning_rate": 4.196761768328599e-06, "loss": 0.006, "reward": 1.7519984051585198, "reward_std": 0.7264958534389734, "rewards/accuracy_reward": 0.45613841600716115, "rewards/cosine_scaled_reward": 0.35062185342776503, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9452381365001201, "step": 470 }, { "completion_length": 733.5050575256348, "epoch": 0.7421006860566238, "grad_norm": 0.009375726733768255, "kl": 0.1457244873046875, "learning_rate": 3.7651019814126656e-06, "loss": 0.0058, "reward": 1.7320308901369572, "reward_std": 0.753149107657373, "rewards/accuracy_reward": 0.45301341358572245, "rewards/cosine_scaled_reward": 0.3402525488520041, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9387649200856686, "step": 480 }, { "completion_length": 725.7989181518554, "epoch": 0.7575611170161368, "grad_norm": 0.006465310695991136, "kl": 0.1441925048828125, "learning_rate": 3.3516673405151546e-06, "loss": 0.0058, "reward": 1.7133542537689208, "reward_std": 0.7624470146372915, "rewards/accuracy_reward": 0.4443080571014434, "rewards/cosine_scaled_reward": 0.3332202689955011, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9358259223401546, "step": 490 }, { "completion_length": 732.0432247161865, "epoch": 0.7730215479756498, "grad_norm": 0.006663296056320388, "kl": 0.1493438720703125, "learning_rate": 2.957666350839663e-06, "loss": 0.006, "reward": 1.7120833061635494, "reward_std": 0.7427917202934623, "rewards/accuracy_reward": 0.44140627095475793, "rewards/cosine_scaled_reward": 0.3323585350837675, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9383184887468815, "step": 500 }, { "epoch": 0.7730215479756498, "eval_completion_length": 724.1022491455078, "eval_kl": 0.14892578125, "eval_loss": 0.006081230938434601, "eval_reward": 1.7919847667217255, "eval_reward_std": 0.7341814786195755, "eval_rewards/accuracy_reward": 0.474330373108387, "eval_rewards/cosine_scaled_reward": 0.3756899982690811, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9419643133878708, "eval_runtime": 62.9452, "eval_samples_per_second": 1.573, "eval_steps_per_second": 0.016, "step": 500 }, { "completion_length": 719.1628688812256, "epoch": 0.7884819789351628, "grad_norm": 0.026910990374737, "kl": 0.1684112548828125, "learning_rate": 2.5842507113469307e-06, "loss": 0.0067, "reward": 1.6821819383651018, "reward_std": 0.7549204783514142, "rewards/accuracy_reward": 0.42008930565789343, "rewards/cosine_scaled_reward": 0.3171893151884433, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9449033126235008, "step": 510 }, { "completion_length": 703.1540473937988, "epoch": 0.8039424098946758, "grad_norm": 0.029497387730934427, "kl": 0.1495452880859375, "learning_rate": 2.2325119482391466e-06, "loss": 0.006, "reward": 1.7537529528141023, "reward_std": 0.7176604120060801, "rewards/accuracy_reward": 0.44877234250307085, "rewards/cosine_scaled_reward": 0.3511859173071571, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9537946797907353, "step": 520 }, { "completion_length": 715.8909927368164, "epoch": 0.8194028408541888, "grad_norm": 0.006911653084698067, "kl": 0.1466156005859375, "learning_rate": 1.9034782243345074e-06, "loss": 0.0059, "reward": 1.7353017818182708, "reward_std": 0.7042613643221557, "rewards/accuracy_reward": 0.4434151995461434, "rewards/cosine_scaled_reward": 0.34125408774707466, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9506324734538794, "step": 530 }, { "completion_length": 731.9873096466065, "epoch": 0.8348632718137018, "grad_norm": 0.10031774065756535, "kl": 0.165179443359375, "learning_rate": 1.5981113336584041e-06, "loss": 0.0066, "reward": 1.720738895609975, "reward_std": 0.7829023336991667, "rewards/accuracy_reward": 0.44453127147862687, "rewards/cosine_scaled_reward": 0.33692186851403677, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9392857551574707, "step": 540 }, { "completion_length": 726.0302787780762, "epoch": 0.8503237027732148, "grad_norm": 0.00915840041448343, "kl": 0.1617706298828125, "learning_rate": 1.3173038900362977e-06, "loss": 0.0065, "reward": 1.7284724555909634, "reward_std": 0.7755123546347023, "rewards/accuracy_reward": 0.4477678783237934, "rewards/cosine_scaled_reward": 0.34402298720087854, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9366815879940986, "step": 550 }, { "completion_length": 716.5763721466064, "epoch": 0.8657841337327278, "grad_norm": 0.0077065633985853085, "kl": 0.151544189453125, "learning_rate": 1.0618767179063416e-06, "loss": 0.0061, "reward": 1.7493106886744498, "reward_std": 0.7468110140413046, "rewards/accuracy_reward": 0.45625002135057, "rewards/cosine_scaled_reward": 0.3529192515881732, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9401414062827825, "step": 560 }, { "completion_length": 711.7772666931153, "epoch": 0.8812445646922408, "grad_norm": 0.011223015630773887, "kl": 0.1598358154296875, "learning_rate": 8.325764529785851e-07, "loss": 0.0064, "reward": 1.7419822074472904, "reward_std": 0.7288113379850983, "rewards/accuracy_reward": 0.45122770036105064, "rewards/cosine_scaled_reward": 0.34804613249725663, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9427083749324083, "step": 570 }, { "completion_length": 717.4036037445069, "epoch": 0.8967049956517538, "grad_norm": 0.01473136538282227, "kl": 0.1699462890625, "learning_rate": 6.300733597542086e-07, "loss": 0.0068, "reward": 1.7380871541798115, "reward_std": 0.7284659473225474, "rewards/accuracy_reward": 0.4454241285100579, "rewards/cosine_scaled_reward": 0.3448207150679082, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9478422913700342, "step": 580 }, { "completion_length": 722.4015926361084, "epoch": 0.9121654266112668, "grad_norm": 0.015247562461578802, "kl": 0.1722503662109375, "learning_rate": 4.549593722844492e-07, "loss": 0.0069, "reward": 1.7376306042075158, "reward_std": 0.7329583563841879, "rewards/accuracy_reward": 0.4400669841095805, "rewards/cosine_scaled_reward": 0.34566624723374845, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.951897357404232, "step": 590 }, { "completion_length": 719.5832901000977, "epoch": 0.9276258575707798, "grad_norm": 0.008595325912121869, "kl": 0.1673126220703125, "learning_rate": 3.0774636389618196e-07, "loss": 0.0067, "reward": 1.7701299749314785, "reward_std": 0.7306436906568706, "rewards/accuracy_reward": 0.4577009153552353, "rewards/cosine_scaled_reward": 0.360122480080463, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9523065883666277, "step": 600 }, { "epoch": 0.9276258575707798, "eval_completion_length": 705.7041168212891, "eval_kl": 0.164794921875, "eval_loss": 0.006647891830652952, "eval_reward": 1.8423524498939514, "eval_reward_std": 0.6980961859226227, "eval_rewards/accuracy_reward": 0.4832589626312256, "eval_rewards/cosine_scaled_reward": 0.39815596491098404, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9609375596046448, "eval_runtime": 63.3214, "eval_samples_per_second": 1.563, "eval_steps_per_second": 0.016, "step": 600 }, { "completion_length": 719.9855236053467, "epoch": 0.9430862885302927, "grad_norm": 0.014629584328010486, "kl": 0.17073974609375, "learning_rate": 1.8886465094192895e-07, "loss": 0.0068, "reward": 1.7343647606670856, "reward_std": 0.7088968453928828, "rewards/accuracy_reward": 0.4390625214669853, "rewards/cosine_scaled_reward": 0.3427352339422214, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9525669939815998, "step": 610 }, { "completion_length": 721.9812828063965, "epoch": 0.9585467194898057, "grad_norm": 0.020088112225356343, "kl": 0.1849456787109375, "learning_rate": 9.866173494794462e-08, "loss": 0.0074, "reward": 1.7370413817465304, "reward_std": 0.7334370331838727, "rewards/accuracy_reward": 0.44151787713635715, "rewards/cosine_scaled_reward": 0.34656511796929407, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9489583697170019, "step": 620 }, { "completion_length": 724.9088500976562, "epoch": 0.9740071504493187, "grad_norm": 0.009230798738629389, "kl": 0.179193115234375, "learning_rate": 3.7401286837214224e-08, "loss": 0.0072, "reward": 1.7149522617459296, "reward_std": 0.740879999101162, "rewards/accuracy_reward": 0.43069198355078697, "rewards/cosine_scaled_reward": 0.3343346292153001, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9499256368726492, "step": 630 }, { "completion_length": 733.0805023193359, "epoch": 0.9894675814088317, "grad_norm": 0.013971562093972711, "kl": 0.177264404296875, "learning_rate": 5.262376196544239e-09, "loss": 0.0071, "reward": 1.6887946531176568, "reward_std": 0.7455704480409622, "rewards/accuracy_reward": 0.4194196627475321, "rewards/cosine_scaled_reward": 0.3205654217163101, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9488095600157976, "step": 640 }, { "completion_length": 726.9589246114095, "epoch": 0.9987438399845395, "kl": 0.1743927001953125, "reward": 1.7412781628469627, "reward_std": 0.7270878640313944, "rewards/accuracy_reward": 0.444568472293516, "rewards/cosine_scaled_reward": 0.34755290367562947, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9491567853838205, "step": 646, "total_flos": 0.0, "train_loss": 0.007009532302370239, "train_runtime": 74639.7368, "train_samples_per_second": 0.971, "train_steps_per_second": 0.009 } ], "logging_steps": 10, "max_steps": 646, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }