{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 254.375, "epoch": 0.00025, "grad_norm": 0.37493768334388733, "kl": 0.0, "learning_rate": 1.2500000000000001e-08, "loss": -0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 1 }, { "completion_length": 220.875, "epoch": 0.0005, "grad_norm": 0.37041813135147095, "kl": 0.0, "learning_rate": 2.5000000000000002e-08, "loss": -0.0, "reward": 0.7687499523162842, "reward_std": 0.7151111364364624, "rewards/_accuracy_reward": 0.26874998211860657, "rewards/_format_reward": 0.5, "step": 2 }, { "completion_length": 225.5, "epoch": 0.00075, "grad_norm": 0.37694671750068665, "kl": 0.0006723726983182132, "learning_rate": 3.7500000000000005e-08, "loss": 0.0, "reward": 0.7875000238418579, "reward_std": 0.9026746153831482, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 0.5, "step": 3 }, { "completion_length": 238.875, "epoch": 0.001, "grad_norm": 0.34139618277549744, "kl": 0.0005685070063918829, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 0.375, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.125, "step": 4 }, { "completion_length": 179.75, "epoch": 0.00125, "grad_norm": 0.3876967430114746, "kl": 0.0005563868908211589, "learning_rate": 6.250000000000001e-08, "loss": 0.0, "reward": 1.125, "reward_std": 0.9910312294960022, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.5, "step": 5 }, { "completion_length": 210.5, "epoch": 0.0015, "grad_norm": 0.32642900943756104, "kl": 0.0006381691200658679, "learning_rate": 7.500000000000001e-08, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 6 }, { "completion_length": 114.875, "epoch": 0.00175, "grad_norm": 0.63201904296875, "kl": 0.00071949657285586, "learning_rate": 8.750000000000001e-08, "loss": 0.0, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 7 }, { "completion_length": 233.375, "epoch": 0.002, "grad_norm": 0.6539371013641357, "kl": 0.0006772859487682581, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "reward": 0.13749998807907104, "reward_std": 0.3691205382347107, "rewards/_accuracy_reward": 0.012500000186264515, "rewards/_format_reward": 0.125, "step": 8 }, { "completion_length": 201.875, "epoch": 0.00225, "grad_norm": 0.45297423005104065, "kl": 0.0006543596973642707, "learning_rate": 1.1250000000000001e-07, "loss": 0.0, "reward": 0.875, "reward_std": 0.9910312294960022, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.375, "step": 9 }, { "completion_length": 194.5, "epoch": 0.0025, "grad_norm": 0.4227246046066284, "kl": 0.000667984364554286, "learning_rate": 1.2500000000000002e-07, "loss": 0.0, "reward": 0.71875, "reward_std": 0.8066409826278687, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.5, "step": 10 }, { "completion_length": 243.0, "epoch": 0.00275, "grad_norm": 0.0012034112587571144, "kl": 0.0005753615405410528, "learning_rate": 1.375e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/_accuracy_reward": 0.0, "rewards/_format_reward": 0.0, "step": 11 }, { "completion_length": 209.625, "epoch": 0.003, "grad_norm": 0.3236827254295349, "kl": 0.0004996673669666052, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 0.574999988079071, "reward_std": 0.619331419467926, "rewards/_accuracy_reward": 0.07500000298023224, "rewards/_format_reward": 0.5, "step": 12 }, { "completion_length": 226.375, "epoch": 0.00325, "grad_norm": 0.3909885287284851, "kl": 0.0006868684431537986, "learning_rate": 1.625e-07, "loss": 0.0, "reward": 0.34375, "reward_std": 0.4988826811313629, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.125, "step": 13 }, { "completion_length": 232.0, "epoch": 0.0035, "grad_norm": 0.2995592951774597, "kl": 0.0005955615197308362, "learning_rate": 1.7500000000000002e-07, "loss": 0.0, "reward": 0.5625, "reward_std": 0.810092568397522, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.375, "step": 14 }, { "completion_length": 172.625, "epoch": 0.00375, "grad_norm": 0.5309815406799316, "kl": 0.0006232442683540285, "learning_rate": 1.875e-07, "loss": 0.0, "reward": 0.6312500238418579, "reward_std": 0.9192144870758057, "rewards/_accuracy_reward": 0.2562499940395355, "rewards/_format_reward": 0.375, "step": 15 }, { "completion_length": 123.375, "epoch": 0.004, "grad_norm": 0.3811752200126648, "kl": 0.000560120097361505, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 0.96875, "reward_std": 0.5250425338745117, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.75, "step": 16 }, { "completion_length": 226.125, "epoch": 0.00425, "grad_norm": 0.3893718123435974, "kl": 0.0008009783923625946, "learning_rate": 2.1250000000000003e-07, "loss": 0.0, "reward": 0.4437499940395355, "reward_std": 0.6155354380607605, "rewards/_accuracy_reward": 0.06875000149011612, "rewards/_format_reward": 0.375, "step": 17 }, { "completion_length": 204.25, "epoch": 0.0045, "grad_norm": 0.3759849965572357, "kl": 0.0006915747653692961, "learning_rate": 2.2500000000000002e-07, "loss": 0.0, "reward": 0.5625, "reward_std": 0.7647361755371094, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.25, "step": 18 }, { "completion_length": 231.0, "epoch": 0.00475, "grad_norm": 0.34655511379241943, "kl": 0.000734607398044318, "learning_rate": 2.3750000000000003e-07, "loss": 0.0, "reward": 0.53125, "reward_std": 0.9106267690658569, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.25, "step": 19 }, { "completion_length": 244.25, "epoch": 0.005, "grad_norm": 0.32063984870910645, "kl": 0.0005592820816673338, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 20 }, { "completion_length": 237.625, "epoch": 0.00525, "grad_norm": 0.3625517189502716, "kl": 0.0005329761188477278, "learning_rate": 2.6250000000000003e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.4381372928619385, "rewards/_accuracy_reward": 0.0625, "rewards/_format_reward": 0.125, "step": 21 }, { "completion_length": 240.5, "epoch": 0.0055, "grad_norm": 0.39223137497901917, "kl": 0.0006619459018111229, "learning_rate": 2.75e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.125, "step": 22 }, { "completion_length": 243.5, "epoch": 0.00575, "grad_norm": 0.4175410866737366, "kl": 0.0005919833201915026, "learning_rate": 2.8750000000000005e-07, "loss": 0.0, "reward": 0.13124999403953552, "reward_std": 0.3712310194969177, "rewards/_accuracy_reward": 0.0062500000931322575, "rewards/_format_reward": 0.125, "step": 23 }, { "completion_length": 221.625, "epoch": 0.006, "grad_norm": 0.3331623673439026, "kl": 0.0006004280294291675, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 0.13124999403953552, "reward_std": 0.3712310194969177, "rewards/_accuracy_reward": 0.0062500000931322575, "rewards/_format_reward": 0.125, "step": 24 }, { "completion_length": 235.0, "epoch": 0.00625, "grad_norm": 0.3354165852069855, "kl": 0.0007247485918924212, "learning_rate": 3.125e-07, "loss": 0.0, "reward": 0.5187499523162842, "reward_std": 0.554808497428894, "rewards/_accuracy_reward": 0.14374999701976776, "rewards/_format_reward": 0.375, "step": 25 }, { "completion_length": 183.625, "epoch": 0.0065, "grad_norm": 0.36789894104003906, "kl": 0.0005746211390942335, "learning_rate": 3.25e-07, "loss": 0.0, "reward": 0.65625, "reward_std": 0.6399986147880554, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.5, "step": 26 }, { "completion_length": 249.75, "epoch": 0.00675, "grad_norm": 0.34738096594810486, "kl": 0.0005474050994962454, "learning_rate": 3.3750000000000005e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.5786375403404236, "rewards/_accuracy_reward": 0.0625, "rewards/_format_reward": 0.25, "step": 27 }, { "completion_length": 225.125, "epoch": 0.007, "grad_norm": 0.40637290477752686, "kl": 0.0006180583732202649, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 28 }, { "completion_length": 220.375, "epoch": 0.00725, "grad_norm": 0.3404392600059509, "kl": 0.0006671351147815585, "learning_rate": 3.625e-07, "loss": 0.0, "reward": 0.875, "reward_std": 0.9910312294960022, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.375, "step": 29 }, { "completion_length": 217.75, "epoch": 0.0075, "grad_norm": 0.3346198797225952, "kl": 0.0005711165722459555, "learning_rate": 3.75e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.5250425338745117, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.125, "step": 30 }, { "completion_length": 203.75, "epoch": 0.00775, "grad_norm": 0.3994467854499817, "kl": 0.0006567926029674709, "learning_rate": 3.8750000000000005e-07, "loss": 0.0, "reward": 0.9375, "reward_std": 0.8530408143997192, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 0.5, "step": 31 }, { "completion_length": 245.25, "epoch": 0.008, "grad_norm": 0.3885516822338104, "kl": 0.0007667718455195427, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 0.15625, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.03125, "rewards/_format_reward": 0.125, "step": 32 }, { "completion_length": 142.5, "epoch": 0.00825, "grad_norm": 0.5926032066345215, "kl": 0.0007175234495662153, "learning_rate": 4.125000000000001e-07, "loss": 0.0, "reward": 0.78125, "reward_std": 0.6469364762306213, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.625, "step": 33 }, { "completion_length": 256.0, "epoch": 0.0085, "grad_norm": 0.31638824939727783, "kl": 0.0005166275659576058, "learning_rate": 4.2500000000000006e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 34 }, { "completion_length": 135.75, "epoch": 0.00875, "grad_norm": 0.48894402384757996, "kl": 0.0006614525336772203, "learning_rate": 4.375e-07, "loss": 0.0, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 35 }, { "completion_length": 199.875, "epoch": 0.009, "grad_norm": 0.35523107647895813, "kl": 0.0007132225437089801, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 1.256250023841858, "reward_std": 1.0265884399414062, "rewards/_accuracy_reward": 0.6312500238418579, "rewards/_format_reward": 0.625, "step": 36 }, { "completion_length": 231.25, "epoch": 0.00925, "grad_norm": 0.35712504386901855, "kl": 0.0005631354288198054, "learning_rate": 4.625e-07, "loss": 0.0, "reward": 0.34375, "reward_std": 0.5659615993499756, "rewards/_accuracy_reward": 0.09375, "rewards/_format_reward": 0.25, "step": 37 }, { "completion_length": 128.75, "epoch": 0.0095, "grad_norm": 0.48980286717414856, "kl": 0.0007973555475473404, "learning_rate": 4.7500000000000006e-07, "loss": 0.0, "reward": 1.15625, "reward_std": 0.9904679656028748, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 0.625, "step": 38 }, { "completion_length": 119.875, "epoch": 0.00975, "grad_norm": 0.4735913872718811, "kl": 0.0006368064787238836, "learning_rate": 4.875000000000001e-07, "loss": 0.0, "reward": 0.9375, "reward_std": 0.5786375403404236, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.75, "step": 39 }, { "completion_length": 238.25, "epoch": 0.01, "grad_norm": 0.0012446728069335222, "kl": 0.0006722843972966075, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/_accuracy_reward": 0.0, "rewards/_format_reward": 0.0, "step": 40 }, { "completion_length": 175.625, "epoch": 0.01025, "grad_norm": 0.403473436832428, "kl": 0.0006806729943491518, "learning_rate": 5.125e-07, "loss": 0.0, "reward": 1.225000023841858, "reward_std": 0.782395601272583, "rewards/_accuracy_reward": 0.4749999940395355, "rewards/_format_reward": 0.75, "step": 41 }, { "completion_length": 231.875, "epoch": 0.0105, "grad_norm": 0.49770158529281616, "kl": 0.0007452387944795191, "learning_rate": 5.250000000000001e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.5786375403404236, "rewards/_accuracy_reward": 0.0625, "rewards/_format_reward": 0.25, "step": 42 }, { "completion_length": 153.375, "epoch": 0.01075, "grad_norm": 0.4276140332221985, "kl": 0.0006472233217209578, "learning_rate": 5.375e-07, "loss": 0.0, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 43 }, { "completion_length": 253.125, "epoch": 0.011, "grad_norm": 0.3752562701702118, "kl": 0.0006933521945029497, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 44 }, { "completion_length": 238.0, "epoch": 0.01125, "grad_norm": 0.36442264914512634, "kl": 0.0006443914026021957, "learning_rate": 5.625e-07, "loss": 0.0, "reward": 0.26374998688697815, "reward_std": 0.4852962791919708, "rewards/_accuracy_reward": 0.013749999925494194, "rewards/_format_reward": 0.25, "step": 45 }, { "completion_length": 245.25, "epoch": 0.0115, "grad_norm": 0.3032134473323822, "kl": 0.0007067061378620565, "learning_rate": 5.750000000000001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 46 }, { "completion_length": 244.0, "epoch": 0.01175, "grad_norm": 0.29296252131462097, "kl": 0.0005294690490700305, "learning_rate": 5.875e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 47 }, { "completion_length": 217.875, "epoch": 0.012, "grad_norm": 0.452722430229187, "kl": 0.0006610968266613781, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.6312500238418579, "reward_std": 0.9192146062850952, "rewards/_accuracy_reward": 0.2562499940395355, "rewards/_format_reward": 0.375, "step": 48 }, { "completion_length": 212.75, "epoch": 0.01225, "grad_norm": 0.3818938136100769, "kl": 0.000738343340344727, "learning_rate": 6.125000000000001e-07, "loss": 0.0, "reward": 0.59375, "reward_std": 0.7898632287979126, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.375, "step": 49 }, { "completion_length": 253.75, "epoch": 0.0125, "grad_norm": 0.3251837491989136, "kl": 0.0006078524165786803, "learning_rate": 6.25e-07, "loss": 0.0, "reward": 0.13124999403953552, "reward_std": 0.3712310194969177, "rewards/_accuracy_reward": 0.0062500000931322575, "rewards/_format_reward": 0.125, "step": 50 }, { "completion_length": 243.0, "epoch": 0.01275, "grad_norm": 0.5190939903259277, "kl": 0.0007083836244419217, "learning_rate": 6.375e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.125, "step": 51 }, { "completion_length": 238.875, "epoch": 0.013, "grad_norm": 0.28036755323410034, "kl": 0.0006517173023894429, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 0.14374998211860657, "reward_std": 0.3668762743473053, "rewards/_accuracy_reward": 0.01875000074505806, "rewards/_format_reward": 0.125, "step": 52 }, { "completion_length": 246.75, "epoch": 0.01325, "grad_norm": 0.35812488198280334, "kl": 0.0008360664360225201, "learning_rate": 6.625000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.0, "step": 53 }, { "completion_length": 224.125, "epoch": 0.0135, "grad_norm": 0.6414341926574707, "kl": 0.0006303292466327548, "learning_rate": 6.750000000000001e-07, "loss": 0.0, "reward": 0.13124999403953552, "reward_std": 0.35146379470825195, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.0, "step": 54 }, { "completion_length": 179.25, "epoch": 0.01375, "grad_norm": 0.48622655868530273, "kl": 0.0005767670809291303, "learning_rate": 6.875000000000001e-07, "loss": 0.0, "reward": 1.0625, "reward_std": 0.933025062084198, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 0.625, "step": 55 }, { "completion_length": 249.875, "epoch": 0.014, "grad_norm": 0.3496423661708832, "kl": 0.0007510894211009145, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 56 }, { "completion_length": 203.625, "epoch": 0.01425, "grad_norm": 0.4185102880001068, "kl": 0.0006905872724018991, "learning_rate": 7.125e-07, "loss": 0.0, "reward": 0.4437499940395355, "reward_std": 0.6155354380607605, "rewards/_accuracy_reward": 0.06875000149011612, "rewards/_format_reward": 0.375, "step": 57 }, { "completion_length": 168.125, "epoch": 0.0145, "grad_norm": 0.4493039846420288, "kl": 0.0008549755439162254, "learning_rate": 7.25e-07, "loss": 0.0, "reward": 1.0625, "reward_std": 0.933025062084198, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 0.625, "step": 58 }, { "completion_length": 191.375, "epoch": 0.01475, "grad_norm": 0.47024667263031006, "kl": 0.0006958736339583993, "learning_rate": 7.375e-07, "loss": 0.0, "reward": 0.9375, "reward_std": 0.8530408143997192, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 0.5, "step": 59 }, { "completion_length": 221.875, "epoch": 0.015, "grad_norm": 0.38048815727233887, "kl": 0.0005862182006239891, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 0.39374998211860657, "reward_std": 0.5434265732765198, "rewards/_accuracy_reward": 0.01875000074505806, "rewards/_format_reward": 0.375, "step": 60 }, { "completion_length": 168.375, "epoch": 0.01525, "grad_norm": 0.4688246548175812, "kl": 0.0007134783663786948, "learning_rate": 7.625e-07, "loss": 0.0, "reward": 0.762499988079071, "reward_std": 0.5736786127090454, "rewards/_accuracy_reward": 0.13750000298023224, "rewards/_format_reward": 0.625, "step": 61 }, { "completion_length": 223.375, "epoch": 0.0155, "grad_norm": 0.42805564403533936, "kl": 0.0006862103473395109, "learning_rate": 7.750000000000001e-07, "loss": 0.0, "reward": 0.15625, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.03125, "rewards/_format_reward": 0.125, "step": 62 }, { "completion_length": 134.5, "epoch": 0.01575, "grad_norm": 0.3511326014995575, "kl": 0.0005350976716727018, "learning_rate": 7.875000000000001e-07, "loss": 0.0, "reward": 1.09375, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.875, "step": 63 }, { "completion_length": 179.5, "epoch": 0.016, "grad_norm": 0.3511122763156891, "kl": 0.0006030978402122855, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 0.7999999523162842, "reward_std": 0.7319250702857971, "rewards/_accuracy_reward": 0.17500001192092896, "rewards/_format_reward": 0.625, "step": 64 }, { "completion_length": 186.625, "epoch": 0.01625, "grad_norm": 0.40380579233169556, "kl": 0.0007325903279706836, "learning_rate": 8.125000000000001e-07, "loss": 0.0, "reward": 0.71875, "reward_std": 0.8066409826278687, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.5, "step": 65 }, { "completion_length": 243.75, "epoch": 0.0165, "grad_norm": 0.32375988364219666, "kl": 0.0006421853322535753, "learning_rate": 8.250000000000001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 66 }, { "completion_length": 227.625, "epoch": 0.01675, "grad_norm": 0.320794939994812, "kl": 0.000636743672657758, "learning_rate": 8.375000000000001e-07, "loss": 0.0, "reward": 0.8125, "reward_std": 0.9136856198310852, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.5, "step": 67 }, { "completion_length": 212.0, "epoch": 0.017, "grad_norm": 0.3829336166381836, "kl": 0.0007118759676814079, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 0.71875, "reward_std": 0.8066409826278687, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.5, "step": 68 }, { "completion_length": 242.0, "epoch": 0.01725, "grad_norm": 0.3430902361869812, "kl": 0.0006913796532899141, "learning_rate": 8.625e-07, "loss": 0.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/_accuracy_reward": 0.03125, "rewards/_format_reward": 0.0, "step": 69 }, { "completion_length": 238.5, "epoch": 0.0175, "grad_norm": 0.3244885504245758, "kl": 0.0006061598542146385, "learning_rate": 8.75e-07, "loss": 0.0, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 70 }, { "completion_length": 142.625, "epoch": 0.01775, "grad_norm": 0.5040633678436279, "kl": 0.0006145286024548113, "learning_rate": 8.875000000000001e-07, "loss": 0.0, "reward": 0.875, "reward_std": 0.5175491571426392, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.625, "step": 71 }, { "completion_length": 256.0, "epoch": 0.018, "grad_norm": 0.0008479771786369383, "kl": 0.00054691091645509, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/_accuracy_reward": 0.0, "rewards/_format_reward": 0.0, "step": 72 }, { "completion_length": 203.75, "epoch": 0.01825, "grad_norm": 0.441141813993454, "kl": 0.0006427373737096786, "learning_rate": 9.125e-07, "loss": 0.0, "reward": 0.5625, "reward_std": 0.7647361755371094, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.25, "step": 73 }, { "completion_length": 213.125, "epoch": 0.0185, "grad_norm": 0.4041379392147064, "kl": 0.0007986929267644882, "learning_rate": 9.25e-07, "loss": 0.0, "reward": 0.2749999761581421, "reward_std": 0.47883784770965576, "rewards/_accuracy_reward": 0.02500000037252903, "rewards/_format_reward": 0.25, "step": 74 }, { "completion_length": 240.625, "epoch": 0.01875, "grad_norm": 0.3502941429615021, "kl": 0.0006043448811396956, "learning_rate": 9.375000000000001e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.7784771919250488, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.25, "step": 75 }, { "completion_length": 238.75, "epoch": 0.019, "grad_norm": 0.36953458189964294, "kl": 0.0006860418943688273, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 76 }, { "completion_length": 234.625, "epoch": 0.01925, "grad_norm": 0.3768980801105499, "kl": 0.000608474132604897, "learning_rate": 9.625e-07, "loss": 0.0, "reward": 0.26249998807907104, "reward_std": 0.48605549335479736, "rewards/_accuracy_reward": 0.012500000186264515, "rewards/_format_reward": 0.25, "step": 77 }, { "completion_length": 252.875, "epoch": 0.0195, "grad_norm": 0.3738728165626526, "kl": 0.0006259999936446548, "learning_rate": 9.750000000000002e-07, "loss": 0.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/_accuracy_reward": 0.03125, "rewards/_format_reward": 0.0, "step": 78 }, { "completion_length": 186.625, "epoch": 0.01975, "grad_norm": 0.3504292666912079, "kl": 0.0006338249077089131, "learning_rate": 9.875e-07, "loss": 0.0, "reward": 1.2625000476837158, "reward_std": 0.7024192810058594, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.625, "step": 79 }, { "completion_length": 193.0, "epoch": 0.02, "grad_norm": 0.3745087683200836, "kl": 0.000602383108343929, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 0.96875, "reward_std": 0.8602066040039062, "rewards/_accuracy_reward": 0.34375, "rewards/_format_reward": 0.625, "step": 80 }, { "completion_length": 214.0, "epoch": 0.02025, "grad_norm": 0.327633798122406, "kl": 0.0005732266581617296, "learning_rate": 1.0125e-06, "loss": 0.0, "reward": 0.78125, "reward_std": 0.900768518447876, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.375, "step": 81 }, { "completion_length": 217.875, "epoch": 0.0205, "grad_norm": 0.43696385622024536, "kl": 0.000646731466986239, "learning_rate": 1.025e-06, "loss": 0.0, "reward": 0.512499988079071, "reward_std": 0.765669584274292, "rewards/_accuracy_reward": 0.13749998807907104, "rewards/_format_reward": 0.375, "step": 82 }, { "completion_length": 189.125, "epoch": 0.02075, "grad_norm": 0.44965532422065735, "kl": 0.0007412461563944817, "learning_rate": 1.0375e-06, "loss": 0.0, "reward": 0.90625, "reward_std": 0.9994418025016785, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.5, "step": 83 }, { "completion_length": 182.5, "epoch": 0.021, "grad_norm": 0.32483235001564026, "kl": 0.0006356970407068729, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": 1.375, "reward_std": 0.6681531071662903, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.875, "step": 84 }, { "completion_length": 218.375, "epoch": 0.02125, "grad_norm": 0.36401814222335815, "kl": 0.0006372305797412992, "learning_rate": 1.0625e-06, "loss": 0.0, "reward": 0.5625, "reward_std": 0.810092568397522, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.375, "step": 85 }, { "completion_length": 192.875, "epoch": 0.0215, "grad_norm": 0.40159401297569275, "kl": 0.0006454067770391703, "learning_rate": 1.075e-06, "loss": 0.0, "reward": 0.625, "reward_std": 0.6681531071662903, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.5, "step": 86 }, { "completion_length": 199.375, "epoch": 0.02175, "grad_norm": 0.3774471879005432, "kl": 0.000684591883327812, "learning_rate": 1.0875000000000002e-06, "loss": 0.0, "reward": 1.0, "reward_std": 1.0690449476242065, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.5, "step": 87 }, { "completion_length": 235.25, "epoch": 0.022, "grad_norm": 0.4988541901111603, "kl": 0.0005781830986961722, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 0.13749998807907104, "reward_std": 0.3691205382347107, "rewards/_accuracy_reward": 0.012500000186264515, "rewards/_format_reward": 0.125, "step": 88 }, { "completion_length": 231.0, "epoch": 0.02225, "grad_norm": 0.3462050259113312, "kl": 0.0006304323324002326, "learning_rate": 1.1125000000000001e-06, "loss": 0.0, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 89 }, { "completion_length": 240.375, "epoch": 0.0225, "grad_norm": 0.366056352853775, "kl": 0.000524615403264761, "learning_rate": 1.125e-06, "loss": 0.0, "reward": 0.3125, "reward_std": 0.5786375403404236, "rewards/_accuracy_reward": 0.0625, "rewards/_format_reward": 0.25, "step": 90 }, { "completion_length": 212.125, "epoch": 0.02275, "grad_norm": 0.38316699862480164, "kl": 0.0006456922856159508, "learning_rate": 1.1375000000000001e-06, "loss": 0.0, "reward": 0.5625, "reward_std": 0.810092568397522, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.375, "step": 91 }, { "completion_length": 173.875, "epoch": 0.023, "grad_norm": 0.4823635518550873, "kl": 0.0008208313374780118, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": 1.2625000476837158, "reward_std": 0.8826704621315002, "rewards/_accuracy_reward": 0.512499988079071, "rewards/_format_reward": 0.75, "step": 92 }, { "completion_length": 256.0, "epoch": 0.02325, "grad_norm": 0.0010727356420829892, "kl": 0.0005186050548218191, "learning_rate": 1.1625e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/_accuracy_reward": 0.0, "rewards/_format_reward": 0.0, "step": 93 }, { "completion_length": 204.375, "epoch": 0.0235, "grad_norm": 0.4184603691101074, "kl": 0.0008142682490870357, "learning_rate": 1.175e-06, "loss": 0.0, "reward": 1.1875, "reward_std": 0.831843912601471, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.625, "step": 94 }, { "completion_length": 196.5, "epoch": 0.02375, "grad_norm": 0.465569406747818, "kl": 0.0008174768299795687, "learning_rate": 1.1875e-06, "loss": 0.0, "reward": 1.0012500286102295, "reward_std": 0.6813943386077881, "rewards/_accuracy_reward": 0.2512499988079071, "rewards/_format_reward": 0.75, "step": 95 }, { "completion_length": 236.625, "epoch": 0.024, "grad_norm": 0.4273647367954254, "kl": 0.0006561095942743123, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 96 }, { "completion_length": 210.75, "epoch": 0.02425, "grad_norm": 0.4174526035785675, "kl": 0.0006856574909761548, "learning_rate": 1.2125e-06, "loss": 0.0, "reward": 0.65625, "reward_std": 0.8957987427711487, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.25, "step": 97 }, { "completion_length": 239.875, "epoch": 0.0245, "grad_norm": 0.3910273611545563, "kl": 0.0006794078508391976, "learning_rate": 1.2250000000000001e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.7784771919250488, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.25, "step": 98 }, { "completion_length": 230.625, "epoch": 0.02475, "grad_norm": 0.3605254888534546, "kl": 0.0006890887161716819, "learning_rate": 1.2375e-06, "loss": 0.0, "reward": 0.2562499940395355, "reward_std": 0.7047986388206482, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.125, "step": 99 }, { "completion_length": 232.5, "epoch": 0.025, "grad_norm": 0.374227911233902, "kl": 0.0006080594030208886, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 0.26374998688697815, "reward_std": 0.47337502241134644, "rewards/_accuracy_reward": 0.013749999925494194, "rewards/_format_reward": 0.25, "step": 100 }, { "completion_length": 246.125, "epoch": 0.02525, "grad_norm": 0.480892539024353, "kl": 0.0007099907961674035, "learning_rate": 1.2625000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.0, "step": 101 }, { "completion_length": 233.375, "epoch": 0.0255, "grad_norm": 0.34077897667884827, "kl": 0.0006432888912968338, "learning_rate": 1.275e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.9106267690658569, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.25, "step": 102 }, { "completion_length": 122.875, "epoch": 0.02575, "grad_norm": 0.460584819316864, "kl": 0.0007099162903614342, "learning_rate": 1.2875000000000002e-06, "loss": 0.0, "reward": 1.2512500286102295, "reward_std": 0.7314454913139343, "rewards/_accuracy_reward": 0.5012500286102295, "rewards/_format_reward": 0.75, "step": 103 }, { "completion_length": 202.875, "epoch": 0.026, "grad_norm": 0.4294702410697937, "kl": 0.0006664457614533603, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 0.6937500238418579, "reward_std": 0.7907670140266418, "rewards/_accuracy_reward": 0.19374999403953552, "rewards/_format_reward": 0.5, "step": 104 }, { "completion_length": 177.25, "epoch": 0.02625, "grad_norm": 0.40910154581069946, "kl": 0.000768951780628413, "learning_rate": 1.3125000000000001e-06, "loss": 0.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.625, "step": 105 }, { "completion_length": 146.5, "epoch": 0.0265, "grad_norm": 0.45140013098716736, "kl": 0.0007151229656301439, "learning_rate": 1.3250000000000002e-06, "loss": 0.0, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 106 }, { "completion_length": 245.375, "epoch": 0.02675, "grad_norm": 0.35899364948272705, "kl": 0.000733047432731837, "learning_rate": 1.3375000000000001e-06, "loss": 0.0, "reward": 0.75, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.375, "rewards/_format_reward": 0.375, "step": 107 }, { "completion_length": 231.875, "epoch": 0.027, "grad_norm": 0.3983488082885742, "kl": 0.0006859501008875668, "learning_rate": 1.3500000000000002e-06, "loss": 0.0, "reward": 0.375, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.125, "step": 108 }, { "completion_length": 236.5, "epoch": 0.02725, "grad_norm": 0.3864477574825287, "kl": 0.0007214280776679516, "learning_rate": 1.3625000000000003e-06, "loss": 0.0, "reward": 0.29374998807907104, "reward_std": 0.5314652919769287, "rewards/_accuracy_reward": 0.04374999925494194, "rewards/_format_reward": 0.25, "step": 109 }, { "completion_length": 105.75, "epoch": 0.0275, "grad_norm": 0.48207709193229675, "kl": 0.0006837123655714095, "learning_rate": 1.3750000000000002e-06, "loss": 0.0, "reward": 1.162500023841858, "reward_std": 0.548211395740509, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 0.875, "step": 110 }, { "completion_length": 242.5, "epoch": 0.02775, "grad_norm": 0.3625413775444031, "kl": 0.0006246070261113346, "learning_rate": 1.3875000000000003e-06, "loss": 0.0, "reward": 0.26249998807907104, "reward_std": 0.48605549335479736, "rewards/_accuracy_reward": 0.012500000186264515, "rewards/_format_reward": 0.25, "step": 111 }, { "completion_length": 207.875, "epoch": 0.028, "grad_norm": 0.40663740038871765, "kl": 0.0007137281936593354, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 0.90625, "reward_std": 0.9994418025016785, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.5, "step": 112 }, { "completion_length": 236.875, "epoch": 0.02825, "grad_norm": 0.4253649413585663, "kl": 0.0006713285110890865, "learning_rate": 1.4125e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 113 }, { "completion_length": 178.125, "epoch": 0.0285, "grad_norm": 0.390341579914093, "kl": 0.0006668589194305241, "learning_rate": 1.425e-06, "loss": 0.0, "reward": 1.193750023841858, "reward_std": 0.8304204940795898, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.75, "step": 114 }, { "completion_length": 218.875, "epoch": 0.02875, "grad_norm": 0.472661554813385, "kl": 0.0006868990603834391, "learning_rate": 1.4375e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 115 }, { "completion_length": 232.875, "epoch": 0.029, "grad_norm": 0.34683483839035034, "kl": 0.000628342037089169, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 0.5325000286102295, "reward_std": 0.7850523591041565, "rewards/_accuracy_reward": 0.1574999988079071, "rewards/_format_reward": 0.375, "step": 116 }, { "completion_length": 236.625, "epoch": 0.02925, "grad_norm": 0.3442273736000061, "kl": 0.000823180191218853, "learning_rate": 1.4625e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 117 }, { "completion_length": 238.375, "epoch": 0.0295, "grad_norm": 0.35203394293785095, "kl": 0.0007420446490868926, "learning_rate": 1.475e-06, "loss": 0.0, "reward": 0.6312500238418579, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.5062500238418579, "rewards/_format_reward": 0.125, "step": 118 }, { "completion_length": 236.75, "epoch": 0.02975, "grad_norm": 0.411632239818573, "kl": 0.0007851931732147932, "learning_rate": 1.4875000000000002e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.7784771919250488, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.25, "step": 119 }, { "completion_length": 230.0, "epoch": 0.03, "grad_norm": 0.35817092657089233, "kl": 0.0006097652949392796, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 0.8125, "reward_std": 0.9136856198310852, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.5, "step": 120 }, { "completion_length": 226.125, "epoch": 0.03025, "grad_norm": 0.36551421880722046, "kl": 0.0007124023977667093, "learning_rate": 1.5125000000000001e-06, "loss": 0.0, "reward": 0.5625, "reward_std": 0.7647361755371094, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.25, "step": 121 }, { "completion_length": 168.875, "epoch": 0.0305, "grad_norm": 0.4331101179122925, "kl": 0.0007970595033839345, "learning_rate": 1.525e-06, "loss": 0.0, "reward": 1.3875000476837158, "reward_std": 0.3879893124103546, "rewards/_accuracy_reward": 0.38749998807907104, "rewards/_format_reward": 1.0, "step": 122 }, { "completion_length": 241.25, "epoch": 0.03075, "grad_norm": 0.30984166264533997, "kl": 0.0006017238483764231, "learning_rate": 1.5375e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 123 }, { "completion_length": 215.625, "epoch": 0.031, "grad_norm": 0.4169147312641144, "kl": 0.000758463516831398, "learning_rate": 1.5500000000000002e-06, "loss": 0.0, "reward": 0.2824999988079071, "reward_std": 0.5242885947227478, "rewards/_accuracy_reward": 0.1574999988079071, "rewards/_format_reward": 0.125, "step": 124 }, { "completion_length": 225.0, "epoch": 0.03125, "grad_norm": 0.28472763299942017, "kl": 0.0007155483472160995, "learning_rate": 1.5625e-06, "loss": 0.0, "reward": 0.6312500238418579, "reward_std": 0.9114108085632324, "rewards/_accuracy_reward": 0.3812499940395355, "rewards/_format_reward": 0.25, "step": 125 }, { "completion_length": 192.75, "epoch": 0.0315, "grad_norm": 0.4392932057380676, "kl": 0.0008345048408955336, "learning_rate": 1.5750000000000002e-06, "loss": 0.0, "reward": 0.6000000238418579, "reward_std": 0.6447590589523315, "rewards/_accuracy_reward": 0.10000000149011612, "rewards/_format_reward": 0.5, "step": 126 }, { "completion_length": 193.0, "epoch": 0.03175, "grad_norm": 0.3476108908653259, "kl": 0.000728312530554831, "learning_rate": 1.5875e-06, "loss": 0.0, "reward": 0.9124999642372131, "reward_std": 0.6384971141815186, "rewards/_accuracy_reward": 0.16249999403953552, "rewards/_format_reward": 0.75, "step": 127 }, { "completion_length": 221.875, "epoch": 0.032, "grad_norm": 0.42320966720581055, "kl": 0.0008315700688399374, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 0.5625, "reward_std": 0.810092568397522, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.375, "step": 128 }, { "completion_length": 189.875, "epoch": 0.03225, "grad_norm": 0.3862830102443695, "kl": 0.0008328621624968946, "learning_rate": 1.6125000000000002e-06, "loss": 0.0, "reward": 0.875, "reward_std": 0.7196229100227356, "rewards/_accuracy_reward": 0.375, "rewards/_format_reward": 0.5, "step": 129 }, { "completion_length": 237.875, "epoch": 0.0325, "grad_norm": 0.4434264004230499, "kl": 0.000727821490727365, "learning_rate": 1.6250000000000001e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.7784771919250488, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.25, "step": 130 }, { "completion_length": 173.125, "epoch": 0.03275, "grad_norm": 0.6541438102722168, "kl": 0.0010414267890155315, "learning_rate": 1.6375000000000002e-06, "loss": 0.0, "reward": 0.5625, "reward_std": 0.7647361755371094, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.25, "step": 131 }, { "completion_length": 140.0, "epoch": 0.033, "grad_norm": 0.4342222511768341, "kl": 0.0008442209218628705, "learning_rate": 1.6500000000000003e-06, "loss": 0.0, "reward": 1.15625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.75, "step": 132 }, { "completion_length": 155.375, "epoch": 0.03325, "grad_norm": 0.4700578451156616, "kl": 0.0009654579916968942, "learning_rate": 1.6625000000000002e-06, "loss": 0.0, "reward": 1.1875, "reward_std": 0.6373774409294128, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.5, "step": 133 }, { "completion_length": 235.125, "epoch": 0.0335, "grad_norm": 0.4328174889087677, "kl": 0.0007693137740716338, "learning_rate": 1.6750000000000003e-06, "loss": 0.0, "reward": 0.3812499940395355, "reward_std": 0.75020831823349, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.25, "step": 134 }, { "completion_length": 256.0, "epoch": 0.03375, "grad_norm": 0.0011643688194453716, "kl": 0.0006938659935258329, "learning_rate": 1.6875000000000001e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/_accuracy_reward": 0.0, "rewards/_format_reward": 0.0, "step": 135 }, { "completion_length": 176.25, "epoch": 0.034, "grad_norm": 0.46144965291023254, "kl": 0.0012229140847921371, "learning_rate": 1.7000000000000002e-06, "loss": 0.0, "reward": 0.42499998211860657, "reward_std": 0.5763183832168579, "rewards/_accuracy_reward": 0.05000000447034836, "rewards/_format_reward": 0.375, "step": 136 }, { "completion_length": 108.25, "epoch": 0.03425, "grad_norm": 0.641877293586731, "kl": 0.000889874529093504, "learning_rate": 1.7125000000000003e-06, "loss": 0.0, "reward": 1.1375000476837158, "reward_std": 0.8327021598815918, "rewards/_accuracy_reward": 0.512499988079071, "rewards/_format_reward": 0.625, "step": 137 }, { "completion_length": 243.875, "epoch": 0.0345, "grad_norm": 0.4593142569065094, "kl": 0.000851454387884587, "learning_rate": 1.725e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 138 }, { "completion_length": 166.75, "epoch": 0.03475, "grad_norm": 0.37833383679389954, "kl": 0.000998564064502716, "learning_rate": 1.7375e-06, "loss": 0.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.625, "step": 139 }, { "completion_length": 155.5, "epoch": 0.035, "grad_norm": 0.48516976833343506, "kl": 0.0008888572920113802, "learning_rate": 1.75e-06, "loss": 0.0, "reward": 0.9812500476837158, "reward_std": 0.6750330924987793, "rewards/_accuracy_reward": 0.23124998807907104, "rewards/_format_reward": 0.75, "step": 140 }, { "completion_length": 140.0, "epoch": 0.03525, "grad_norm": 0.4754278063774109, "kl": 0.001002427306957543, "learning_rate": 1.7625e-06, "loss": 0.0, "reward": 1.34375, "reward_std": 0.8230767846107483, "rewards/_accuracy_reward": 0.59375, "rewards/_format_reward": 0.75, "step": 141 }, { "completion_length": 221.875, "epoch": 0.0355, "grad_norm": 0.5134493708610535, "kl": 0.0009355404181405902, "learning_rate": 1.7750000000000002e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 142 }, { "completion_length": 204.0, "epoch": 0.03575, "grad_norm": 0.41457489132881165, "kl": 0.001152246375568211, "learning_rate": 1.7875e-06, "loss": 0.0, "reward": 0.2562499940395355, "reward_std": 0.7047985792160034, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.125, "step": 143 }, { "completion_length": 241.75, "epoch": 0.036, "grad_norm": 0.37025943398475647, "kl": 0.0008789349813014269, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 144 }, { "completion_length": 145.25, "epoch": 0.03625, "grad_norm": 0.6227016448974609, "kl": 0.0012156821321696043, "learning_rate": 1.8125e-06, "loss": 0.0, "reward": 0.9387500286102295, "reward_std": 0.8531527519226074, "rewards/_accuracy_reward": 0.3137499988079071, "rewards/_format_reward": 0.625, "step": 145 }, { "completion_length": 194.25, "epoch": 0.0365, "grad_norm": 0.40095922350883484, "kl": 0.0010887724347412586, "learning_rate": 1.825e-06, "loss": 0.0, "reward": 1.193750023841858, "reward_std": 0.8304204940795898, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.75, "step": 146 }, { "completion_length": 210.5, "epoch": 0.03675, "grad_norm": 0.3590393662452698, "kl": 0.001111085875891149, "learning_rate": 1.8375000000000002e-06, "loss": 0.0, "reward": 0.6312500238418579, "reward_std": 0.6169843077659607, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.5, "step": 147 }, { "completion_length": 207.25, "epoch": 0.037, "grad_norm": 0.46096640825271606, "kl": 0.0010449312394484878, "learning_rate": 1.85e-06, "loss": 0.0, "reward": 0.8887499570846558, "reward_std": 0.8371795415878296, "rewards/_accuracy_reward": 0.26374998688697815, "rewards/_format_reward": 0.625, "step": 148 }, { "completion_length": 245.125, "epoch": 0.03725, "grad_norm": 0.3660503327846527, "kl": 0.0008743983926251531, "learning_rate": 1.8625000000000002e-06, "loss": 0.0, "reward": 0.3812499940395355, "reward_std": 0.7502082586288452, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.25, "step": 149 }, { "completion_length": 204.75, "epoch": 0.0375, "grad_norm": 0.3448229730129242, "kl": 0.0009303970145992935, "learning_rate": 1.8750000000000003e-06, "loss": 0.0, "reward": 1.1324999332427979, "reward_std": 0.549304723739624, "rewards/_accuracy_reward": 0.2574999928474426, "rewards/_format_reward": 0.875, "step": 150 }, { "completion_length": 201.125, "epoch": 0.03775, "grad_norm": 0.4772341847419739, "kl": 0.0015494409017264843, "learning_rate": 1.8875000000000001e-06, "loss": 0.0001, "reward": 0.3812499940395355, "reward_std": 0.75020831823349, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.25, "step": 151 }, { "completion_length": 168.5, "epoch": 0.038, "grad_norm": 0.5399798154830933, "kl": 0.001348810619674623, "learning_rate": 1.9000000000000002e-06, "loss": 0.0001, "reward": 1.1437499523162842, "reward_std": 0.8317097425460815, "rewards/_accuracy_reward": 0.39374998211860657, "rewards/_format_reward": 0.75, "step": 152 }, { "completion_length": 223.625, "epoch": 0.03825, "grad_norm": 0.36858677864074707, "kl": 0.0011914662318304181, "learning_rate": 1.9125000000000003e-06, "loss": 0.0, "reward": 0.8812500238418579, "reward_std": 0.9920892715454102, "rewards/_accuracy_reward": 0.3812499940395355, "rewards/_format_reward": 0.5, "step": 153 }, { "completion_length": 179.0, "epoch": 0.0385, "grad_norm": 0.4605531394481659, "kl": 0.0013787942007184029, "learning_rate": 1.925e-06, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.9994418025016785, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.5, "step": 154 }, { "completion_length": 252.25, "epoch": 0.03875, "grad_norm": 0.4210733473300934, "kl": 0.0009038643911480904, "learning_rate": 1.9375e-06, "loss": 0.0, "reward": 0.15625, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.03125, "rewards/_format_reward": 0.125, "step": 155 }, { "completion_length": 246.0, "epoch": 0.039, "grad_norm": 0.40100812911987305, "kl": 0.001074227737262845, "learning_rate": 1.9500000000000004e-06, "loss": 0.0, "reward": 0.3762499988079071, "reward_std": 0.7452312707901001, "rewards/_accuracy_reward": 0.1262499988079071, "rewards/_format_reward": 0.25, "step": 156 }, { "completion_length": 215.875, "epoch": 0.03925, "grad_norm": 0.39963439106941223, "kl": 0.0015569372335448861, "learning_rate": 1.9625000000000003e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.5659615993499756, "rewards/_accuracy_reward": 0.09375, "rewards/_format_reward": 0.25, "step": 157 }, { "completion_length": 251.625, "epoch": 0.0395, "grad_norm": 0.34965765476226807, "kl": 0.0011444491101428866, "learning_rate": 1.975e-06, "loss": 0.0, "reward": 0.13124999403953552, "reward_std": 0.3712310194969177, "rewards/_accuracy_reward": 0.0062500000931322575, "rewards/_format_reward": 0.125, "step": 158 }, { "completion_length": 243.75, "epoch": 0.03975, "grad_norm": 0.36649003624916077, "kl": 0.0010651213815435767, "learning_rate": 1.9875000000000005e-06, "loss": 0.0, "reward": 0.5625, "reward_std": 0.810092568397522, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.375, "step": 159 }, { "completion_length": 199.125, "epoch": 0.04, "grad_norm": 0.4516124725341797, "kl": 0.0014434423064813018, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "reward": 0.824999988079071, "reward_std": 0.7449832558631897, "rewards/_accuracy_reward": 0.19999998807907104, "rewards/_format_reward": 0.625, "step": 160 }, { "completion_length": 207.875, "epoch": 0.04025, "grad_norm": 0.35151946544647217, "kl": 0.001188786351121962, "learning_rate": 2.0125000000000002e-06, "loss": 0.0, "reward": 0.5449999570846558, "reward_std": 0.5869290232658386, "rewards/_accuracy_reward": 0.044999998062849045, "rewards/_format_reward": 0.5, "step": 161 }, { "completion_length": 192.625, "epoch": 0.0405, "grad_norm": 0.5956340432167053, "kl": 0.001869518426246941, "learning_rate": 2.025e-06, "loss": 0.0001, "reward": 0.5187499523162842, "reward_std": 0.7610788941383362, "rewards/_accuracy_reward": 0.14374998211860657, "rewards/_format_reward": 0.375, "step": 162 }, { "completion_length": 241.125, "epoch": 0.04075, "grad_norm": 0.35935690999031067, "kl": 0.0010445680236443877, "learning_rate": 2.0375e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.7784771919250488, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.25, "step": 163 }, { "completion_length": 166.25, "epoch": 0.041, "grad_norm": 0.36909234523773193, "kl": 0.0019193933112546802, "learning_rate": 2.05e-06, "loss": 0.0001, "reward": 1.181249976158142, "reward_std": 0.6335486769676208, "rewards/_accuracy_reward": 0.3062499761581421, "rewards/_format_reward": 0.875, "step": 164 }, { "completion_length": 228.75, "epoch": 0.04125, "grad_norm": 0.402389794588089, "kl": 0.0016254698857665062, "learning_rate": 2.0625e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 165 }, { "completion_length": 113.625, "epoch": 0.0415, "grad_norm": 0.43517887592315674, "kl": 0.0022651588078588247, "learning_rate": 2.075e-06, "loss": 0.0001, "reward": 1.2937500476837158, "reward_std": 0.29932963848114014, "rewards/_accuracy_reward": 0.29374998807907104, "rewards/_format_reward": 1.0, "step": 166 }, { "completion_length": 184.25, "epoch": 0.04175, "grad_norm": 0.4854840040206909, "kl": 0.00202546757645905, "learning_rate": 2.0875e-06, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.6028207540512085, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.75, "step": 167 }, { "completion_length": 173.0, "epoch": 0.042, "grad_norm": 0.36054983735084534, "kl": 0.0024099252186715603, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.6187184453010559, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.875, "step": 168 }, { "completion_length": 214.625, "epoch": 0.04225, "grad_norm": 0.4301230311393738, "kl": 0.0015324982814490795, "learning_rate": 2.1125e-06, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.9994418025016785, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.5, "step": 169 }, { "completion_length": 163.25, "epoch": 0.0425, "grad_norm": 0.4773411154747009, "kl": 0.0024503618478775024, "learning_rate": 2.125e-06, "loss": 0.0001, "reward": 1.0499999523162842, "reward_std": 0.7606388330459595, "rewards/_accuracy_reward": 0.29999998211860657, "rewards/_format_reward": 0.75, "step": 170 }, { "completion_length": 201.0, "epoch": 0.04275, "grad_norm": 0.3986191153526306, "kl": 0.002186427591368556, "learning_rate": 2.1375000000000003e-06, "loss": 0.0001, "reward": 0.6312500238418579, "reward_std": 0.9192145466804504, "rewards/_accuracy_reward": 0.2562499940395355, "rewards/_format_reward": 0.375, "step": 171 }, { "completion_length": 248.875, "epoch": 0.043, "grad_norm": 0.38168638944625854, "kl": 0.002373702824115753, "learning_rate": 2.15e-06, "loss": 0.0001, "reward": 0.3824999928474426, "reward_std": 0.7494902610778809, "rewards/_accuracy_reward": 0.13249999284744263, "rewards/_format_reward": 0.25, "step": 172 }, { "completion_length": 182.875, "epoch": 0.04325, "grad_norm": 0.5253975987434387, "kl": 0.0032906224951148033, "learning_rate": 2.1625e-06, "loss": 0.0001, "reward": 1.037500023841858, "reward_std": 0.9299577474594116, "rewards/_accuracy_reward": 0.4124999940395355, "rewards/_format_reward": 0.625, "step": 173 }, { "completion_length": 143.75, "epoch": 0.0435, "grad_norm": 0.57611483335495, "kl": 0.0019308909540995955, "learning_rate": 2.1750000000000004e-06, "loss": 0.0001, "reward": 1.756250023841858, "reward_std": 0.6894291639328003, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 174 }, { "completion_length": 254.5, "epoch": 0.04375, "grad_norm": 0.34048524498939514, "kl": 0.0014960013795644045, "learning_rate": 2.1875000000000002e-06, "loss": 0.0001, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/_accuracy_reward": 0.0062500000931322575, "rewards/_format_reward": 0.0, "step": 175 }, { "completion_length": 230.875, "epoch": 0.044, "grad_norm": 0.39282166957855225, "kl": 0.0017745784716680646, "learning_rate": 2.2e-06, "loss": 0.0001, "reward": 0.8812500238418579, "reward_std": 0.9920892715454102, "rewards/_accuracy_reward": 0.3812499940395355, "rewards/_format_reward": 0.5, "step": 176 }, { "completion_length": 95.125, "epoch": 0.04425, "grad_norm": 0.64809650182724, "kl": 0.0027632713317871094, "learning_rate": 2.2125e-06, "loss": 0.0001, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.75, "step": 177 }, { "completion_length": 235.375, "epoch": 0.0445, "grad_norm": 0.4109288454055786, "kl": 0.0018161768093705177, "learning_rate": 2.2250000000000003e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 178 }, { "completion_length": 239.5, "epoch": 0.04475, "grad_norm": 0.3848976492881775, "kl": 0.0016923128860071301, "learning_rate": 2.2375e-06, "loss": 0.0001, "reward": 0.5249999761581421, "reward_std": 0.5612486004829407, "rewards/_accuracy_reward": 0.02500000037252903, "rewards/_format_reward": 0.5, "step": 179 }, { "completion_length": 234.875, "epoch": 0.045, "grad_norm": 0.4291512072086334, "kl": 0.001589511870406568, "learning_rate": 2.25e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 180 }, { "completion_length": 233.625, "epoch": 0.04525, "grad_norm": 0.5065072774887085, "kl": 0.0018506099004298449, "learning_rate": 2.2625000000000004e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 181 }, { "completion_length": 214.5, "epoch": 0.0455, "grad_norm": 0.4389038383960724, "kl": 0.0023762795608490705, "learning_rate": 2.2750000000000002e-06, "loss": 0.0001, "reward": 0.39374998211860657, "reward_std": 0.5434265732765198, "rewards/_accuracy_reward": 0.01875000074505806, "rewards/_format_reward": 0.375, "step": 182 }, { "completion_length": 210.0, "epoch": 0.04575, "grad_norm": 0.45881783962249756, "kl": 0.001946362666785717, "learning_rate": 2.2875e-06, "loss": 0.0001, "reward": 0.5687500238418579, "reward_std": 0.7690148949623108, "rewards/_accuracy_reward": 0.19374999403953552, "rewards/_format_reward": 0.375, "step": 183 }, { "completion_length": 219.5, "epoch": 0.046, "grad_norm": 0.4343101978302002, "kl": 0.0028733538929373026, "learning_rate": 2.3000000000000004e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.9161254167556763, "rewards/_accuracy_reward": 0.375, "rewards/_format_reward": 0.25, "step": 184 }, { "completion_length": 220.125, "epoch": 0.04625, "grad_norm": 0.4084475338459015, "kl": 0.00223284843377769, "learning_rate": 2.3125000000000003e-06, "loss": 0.0001, "reward": 0.7200000286102295, "reward_std": 0.805374801158905, "rewards/_accuracy_reward": 0.2199999988079071, "rewards/_format_reward": 0.5, "step": 185 }, { "completion_length": 171.625, "epoch": 0.0465, "grad_norm": 0.5016433000564575, "kl": 0.0028451047837734222, "learning_rate": 2.325e-06, "loss": 0.0001, "reward": 1.037500023841858, "reward_std": 0.9299578070640564, "rewards/_accuracy_reward": 0.4124999940395355, "rewards/_format_reward": 0.625, "step": 186 }, { "completion_length": 168.125, "epoch": 0.04675, "grad_norm": 0.3974522650241852, "kl": 0.0034177624620497227, "learning_rate": 2.3375000000000005e-06, "loss": 0.0001, "reward": 1.2937500476837158, "reward_std": 0.6945900917053223, "rewards/_accuracy_reward": 0.543749988079071, "rewards/_format_reward": 0.75, "step": 187 }, { "completion_length": 249.5, "epoch": 0.047, "grad_norm": 0.4274694621562958, "kl": 0.001807468244805932, "learning_rate": 2.35e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 188 }, { "completion_length": 244.5, "epoch": 0.04725, "grad_norm": 0.006730486173182726, "kl": 0.0024024751037359238, "learning_rate": 2.3625000000000003e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/_accuracy_reward": 0.0, "rewards/_format_reward": 0.0, "step": 189 }, { "completion_length": 235.875, "epoch": 0.0475, "grad_norm": 0.4219696521759033, "kl": 0.002536715939640999, "learning_rate": 2.375e-06, "loss": 0.0001, "reward": 0.6000000238418579, "reward_std": 0.6447591185569763, "rewards/_accuracy_reward": 0.10000000149011612, "rewards/_format_reward": 0.5, "step": 190 }, { "completion_length": 202.0, "epoch": 0.04775, "grad_norm": 0.4399944245815277, "kl": 0.0024324413388967514, "learning_rate": 2.3875e-06, "loss": 0.0001, "reward": 1.0, "reward_std": 0.8237544298171997, "rewards/_accuracy_reward": 0.375, "rewards/_format_reward": 0.625, "step": 191 }, { "completion_length": 244.0, "epoch": 0.048, "grad_norm": 0.3966406285762787, "kl": 0.002154730260372162, "learning_rate": 2.4000000000000003e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.7784771919250488, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.25, "step": 192 }, { "completion_length": 214.625, "epoch": 0.04825, "grad_norm": 0.43435680866241455, "kl": 0.002745242090895772, "learning_rate": 2.4125e-06, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.831843912601471, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.625, "step": 193 }, { "completion_length": 171.625, "epoch": 0.0485, "grad_norm": 0.38358110189437866, "kl": 0.0024988814257085323, "learning_rate": 2.425e-06, "loss": 0.0001, "reward": 1.2312500476837158, "reward_std": 0.6284547448158264, "rewards/_accuracy_reward": 0.35624998807907104, "rewards/_format_reward": 0.875, "step": 194 }, { "completion_length": 189.875, "epoch": 0.04875, "grad_norm": 0.44296523928642273, "kl": 0.0033659208565950394, "learning_rate": 2.4375e-06, "loss": 0.0001, "reward": 1.068750023841858, "reward_std": 0.9250240921974182, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.625, "step": 195 }, { "completion_length": 207.125, "epoch": 0.049, "grad_norm": 0.4014991819858551, "kl": 0.004617661237716675, "learning_rate": 2.4500000000000003e-06, "loss": 0.0002, "reward": 0.6437499523162842, "reward_std": 0.7456720471382141, "rewards/_accuracy_reward": 0.26874998211860657, "rewards/_format_reward": 0.375, "step": 196 }, { "completion_length": 229.125, "epoch": 0.04925, "grad_norm": 0.4298330247402191, "kl": 0.0023158607073128223, "learning_rate": 2.4625e-06, "loss": 0.0001, "reward": 0.5375000238418579, "reward_std": 0.7886471748352051, "rewards/_accuracy_reward": 0.16249999403953552, "rewards/_format_reward": 0.375, "step": 197 }, { "completion_length": 152.125, "epoch": 0.0495, "grad_norm": 0.6460537314414978, "kl": 0.004403269849717617, "learning_rate": 2.475e-06, "loss": 0.0002, "reward": 1.693750023841858, "reward_std": 0.42714792490005493, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 198 }, { "completion_length": 224.625, "epoch": 0.04975, "grad_norm": 0.3826991617679596, "kl": 0.00469087902456522, "learning_rate": 2.4875000000000003e-06, "loss": 0.0002, "reward": 0.8199999928474426, "reward_std": 0.7337185144424438, "rewards/_accuracy_reward": 0.3199999928474426, "rewards/_format_reward": 0.5, "step": 199 }, { "completion_length": 233.5, "epoch": 0.05, "grad_norm": 0.4221881031990051, "kl": 0.003704667557030916, "learning_rate": 2.5e-06, "loss": 0.0001, "reward": 0.4437499940395355, "reward_std": 0.6155354380607605, "rewards/_accuracy_reward": 0.06875000149011612, "rewards/_format_reward": 0.375, "step": 200 }, { "completion_length": 231.75, "epoch": 0.05025, "grad_norm": 0.43503835797309875, "kl": 0.003693893551826477, "learning_rate": 2.5125e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 201 }, { "completion_length": 138.0, "epoch": 0.0505, "grad_norm": 0.4424368739128113, "kl": 0.005755322519689798, "learning_rate": 2.5250000000000004e-06, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.6373774409294128, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.75, "step": 202 }, { "completion_length": 242.875, "epoch": 0.05075, "grad_norm": 0.34338605403900146, "kl": 0.003572634654119611, "learning_rate": 2.5375e-06, "loss": 0.0001, "reward": 0.13749998807907104, "reward_std": 0.3691205382347107, "rewards/_accuracy_reward": 0.012500000186264515, "rewards/_format_reward": 0.125, "step": 203 }, { "completion_length": 147.25, "epoch": 0.051, "grad_norm": 0.5478609204292297, "kl": 0.005715084727853537, "learning_rate": 2.55e-06, "loss": 0.0002, "reward": 1.162500023841858, "reward_std": 0.548211395740509, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 0.875, "step": 204 }, { "completion_length": 208.875, "epoch": 0.05125, "grad_norm": 0.4353210926055908, "kl": 0.006955728400498629, "learning_rate": 2.5625e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.8857755064964294, "rewards/_accuracy_reward": 0.34375, "rewards/_format_reward": 0.5, "step": 205 }, { "completion_length": 183.125, "epoch": 0.0515, "grad_norm": 0.48688188195228577, "kl": 0.005555164068937302, "learning_rate": 2.5750000000000003e-06, "loss": 0.0002, "reward": 1.065000057220459, "reward_std": 0.7645166516304016, "rewards/_accuracy_reward": 0.3149999976158142, "rewards/_format_reward": 0.75, "step": 206 }, { "completion_length": 161.375, "epoch": 0.05175, "grad_norm": 0.5545176267623901, "kl": 0.005829046946018934, "learning_rate": 2.5875000000000002e-06, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 207 }, { "completion_length": 225.0, "epoch": 0.052, "grad_norm": 0.5879900455474854, "kl": 0.0061422791332006454, "learning_rate": 2.6e-06, "loss": 0.0002, "reward": 0.2874999940395355, "reward_std": 0.5350233316421509, "rewards/_accuracy_reward": 0.03750000149011612, "rewards/_format_reward": 0.25, "step": 208 }, { "completion_length": 157.5, "epoch": 0.05225, "grad_norm": 0.42013055086135864, "kl": 0.007088819984346628, "learning_rate": 2.6125e-06, "loss": 0.0003, "reward": 1.537500023841858, "reward_std": 0.7322909235954285, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.875, "step": 209 }, { "completion_length": 191.125, "epoch": 0.0525, "grad_norm": 0.3625972867012024, "kl": 0.0053137680515646935, "learning_rate": 2.6250000000000003e-06, "loss": 0.0002, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 210 }, { "completion_length": 223.0, "epoch": 0.05275, "grad_norm": 0.48341336846351624, "kl": 0.005899759475141764, "learning_rate": 2.6375e-06, "loss": 0.0002, "reward": 0.3137499988079071, "reward_std": 0.5151404142379761, "rewards/_accuracy_reward": 0.0637499988079071, "rewards/_format_reward": 0.25, "step": 211 }, { "completion_length": 219.5, "epoch": 0.053, "grad_norm": 0.46322962641716003, "kl": 0.005710378754884005, "learning_rate": 2.6500000000000005e-06, "loss": 0.0002, "reward": 0.9387500286102295, "reward_std": 0.8531528115272522, "rewards/_accuracy_reward": 0.3137499988079071, "rewards/_format_reward": 0.625, "step": 212 }, { "completion_length": 198.125, "epoch": 0.05325, "grad_norm": 0.5013320446014404, "kl": 0.008783817291259766, "learning_rate": 2.6625e-06, "loss": 0.0004, "reward": 0.7825000286102295, "reward_std": 0.9011222720146179, "rewards/_accuracy_reward": 0.2824999988079071, "rewards/_format_reward": 0.5, "step": 213 }, { "completion_length": 242.625, "epoch": 0.0535, "grad_norm": 0.3888327181339264, "kl": 0.0044942148961126804, "learning_rate": 2.6750000000000002e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.25, "step": 214 }, { "completion_length": 236.75, "epoch": 0.05375, "grad_norm": 0.4056166410446167, "kl": 0.004561163019388914, "learning_rate": 2.6875e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 215 }, { "completion_length": 243.375, "epoch": 0.054, "grad_norm": 0.42273250222206116, "kl": 0.005308468360453844, "learning_rate": 2.7000000000000004e-06, "loss": 0.0002, "reward": 0.6312500238418579, "reward_std": 0.9192146062850952, "rewards/_accuracy_reward": 0.2562499940395355, "rewards/_format_reward": 0.375, "step": 216 }, { "completion_length": 241.125, "epoch": 0.05425, "grad_norm": 0.4731215536594391, "kl": 0.005138865672051907, "learning_rate": 2.7125000000000003e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.375, "rewards/_format_reward": 0.375, "step": 217 }, { "completion_length": 120.625, "epoch": 0.0545, "grad_norm": 0.6542100310325623, "kl": 0.0068123298697173595, "learning_rate": 2.7250000000000006e-06, "loss": 0.0003, "reward": 1.1387500762939453, "reward_std": 0.12017104029655457, "rewards/_accuracy_reward": 0.26374998688697815, "rewards/_format_reward": 0.875, "step": 218 }, { "completion_length": 239.625, "epoch": 0.05475, "grad_norm": 0.5177122950553894, "kl": 0.004553478676825762, "learning_rate": 2.7375e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 0.125, "step": 219 }, { "completion_length": 161.75, "epoch": 0.055, "grad_norm": 0.4810257852077484, "kl": 0.008091006428003311, "learning_rate": 2.7500000000000004e-06, "loss": 0.0003, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 220 }, { "completion_length": 207.0, "epoch": 0.05525, "grad_norm": 0.40970757603645325, "kl": 0.007687645964324474, "learning_rate": 2.7625000000000002e-06, "loss": 0.0003, "reward": 1.024999976158142, "reward_std": 0.7564013004302979, "rewards/_accuracy_reward": 0.2749999761581421, "rewards/_format_reward": 0.75, "step": 221 }, { "completion_length": 154.75, "epoch": 0.0555, "grad_norm": 0.5231997966766357, "kl": 0.009649819694459438, "learning_rate": 2.7750000000000005e-06, "loss": 0.0004, "reward": 1.3125, "reward_std": 0.873723566532135, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.75, "step": 222 }, { "completion_length": 216.375, "epoch": 0.05575, "grad_norm": 0.443153440952301, "kl": 0.007553863804787397, "learning_rate": 2.7875000000000004e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.8066409826278687, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.5, "step": 223 }, { "completion_length": 163.625, "epoch": 0.056, "grad_norm": 0.539243221282959, "kl": 0.009118853136897087, "learning_rate": 2.8000000000000003e-06, "loss": 0.0004, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 224 }, { "completion_length": 157.875, "epoch": 0.05625, "grad_norm": 0.5195901989936829, "kl": 0.009252168238162994, "learning_rate": 2.8125e-06, "loss": 0.0004, "reward": 1.287500023841858, "reward_std": 0.8786149024963379, "rewards/_accuracy_reward": 0.5375000238418579, "rewards/_format_reward": 0.75, "step": 225 }, { "completion_length": 188.75, "epoch": 0.0565, "grad_norm": 0.4762548804283142, "kl": 0.008448407053947449, "learning_rate": 2.825e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.7676494717597961, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 0.625, "step": 226 }, { "completion_length": 184.375, "epoch": 0.05675, "grad_norm": 0.46647635102272034, "kl": 0.01854267716407776, "learning_rate": 2.8375000000000004e-06, "loss": 0.0007, "reward": 0.7875000238418579, "reward_std": 0.7024192810058594, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 0.5, "step": 227 }, { "completion_length": 146.125, "epoch": 0.057, "grad_norm": 0.6877365112304688, "kl": 0.00886484608054161, "learning_rate": 2.85e-06, "loss": 0.0004, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 228 }, { "completion_length": 229.0, "epoch": 0.05725, "grad_norm": 0.5037513375282288, "kl": 0.00627094367519021, "learning_rate": 2.8625e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 229 }, { "completion_length": 129.25, "epoch": 0.0575, "grad_norm": 1.007373571395874, "kl": 0.01517564244568348, "learning_rate": 2.875e-06, "loss": 0.0006, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 230 }, { "completion_length": 243.625, "epoch": 0.05775, "grad_norm": 0.41951367259025574, "kl": 0.007699695415794849, "learning_rate": 2.8875000000000003e-06, "loss": 0.0003, "reward": 0.3812499940395355, "reward_std": 0.7502082586288452, "rewards/_accuracy_reward": 0.13124999403953552, "rewards/_format_reward": 0.25, "step": 231 }, { "completion_length": 198.125, "epoch": 0.058, "grad_norm": 0.3918209969997406, "kl": 0.010249249637126923, "learning_rate": 2.9e-06, "loss": 0.0004, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 232 }, { "completion_length": 213.875, "epoch": 0.05825, "grad_norm": 0.41451263427734375, "kl": 0.008266115561127663, "learning_rate": 2.9125000000000005e-06, "loss": 0.0003, "reward": 1.0125000476837158, "reward_std": 0.9261093735694885, "rewards/_accuracy_reward": 0.38749998807907104, "rewards/_format_reward": 0.625, "step": 233 }, { "completion_length": 160.125, "epoch": 0.0585, "grad_norm": 0.4602366089820862, "kl": 0.011817101389169693, "learning_rate": 2.925e-06, "loss": 0.0005, "reward": 1.1375000476837158, "reward_std": 0.548211395740509, "rewards/_accuracy_reward": 0.26249998807907104, "rewards/_format_reward": 0.875, "step": 234 }, { "completion_length": 150.125, "epoch": 0.05875, "grad_norm": 0.5556851029396057, "kl": 0.012773082591593266, "learning_rate": 2.9375000000000003e-06, "loss": 0.0005, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 235 }, { "completion_length": 236.625, "epoch": 0.059, "grad_norm": 0.3992154002189636, "kl": 0.008578523993492126, "learning_rate": 2.95e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 1.0690449476242065, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.5, "step": 236 }, { "completion_length": 223.25, "epoch": 0.05925, "grad_norm": 0.4743492901325226, "kl": 0.009763360023498535, "learning_rate": 2.9625000000000004e-06, "loss": 0.0004, "reward": 0.512499988079071, "reward_std": 0.765669584274292, "rewards/_accuracy_reward": 0.13749998807907104, "rewards/_format_reward": 0.375, "step": 237 }, { "completion_length": 232.75, "epoch": 0.0595, "grad_norm": 0.3819025754928589, "kl": 0.009855308569967747, "learning_rate": 2.9750000000000003e-06, "loss": 0.0004, "reward": 0.6687500476837158, "reward_std": 0.7736451625823975, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 0.5, "step": 238 }, { "completion_length": 219.0, "epoch": 0.05975, "grad_norm": 0.6082333922386169, "kl": 0.013843866065144539, "learning_rate": 2.9875e-06, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.9348175525665283, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.375, "step": 239 }, { "completion_length": 243.75, "epoch": 0.06, "grad_norm": 0.40308472514152527, "kl": 0.009744809940457344, "learning_rate": 3e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.375, "rewards/_format_reward": 0.375, "step": 240 }, { "completion_length": 169.375, "epoch": 0.06025, "grad_norm": 0.6102287769317627, "kl": 0.012486739084124565, "learning_rate": 3.0125000000000004e-06, "loss": 0.0005, "reward": 0.971250057220459, "reward_std": 0.6741225123405457, "rewards/_accuracy_reward": 0.2212499976158142, "rewards/_format_reward": 0.75, "step": 241 }, { "completion_length": 192.125, "epoch": 0.0605, "grad_norm": 0.4726582169532776, "kl": 0.014720995910465717, "learning_rate": 3.0250000000000003e-06, "loss": 0.0006, "reward": 0.90625, "reward_std": 0.6483151316642761, "rewards/_accuracy_reward": 0.1562499850988388, "rewards/_format_reward": 0.75, "step": 242 }, { "completion_length": 207.625, "epoch": 0.06075, "grad_norm": 0.4914742708206177, "kl": 0.019970744848251343, "learning_rate": 3.0375000000000006e-06, "loss": 0.0008, "reward": 1.15625, "reward_std": 0.9904679656028748, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 0.625, "step": 243 }, { "completion_length": 153.625, "epoch": 0.061, "grad_norm": 0.4838781952857971, "kl": 0.015723643824458122, "learning_rate": 3.05e-06, "loss": 0.0006, "reward": 1.375, "reward_std": 0.6681531071662903, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.875, "step": 244 }, { "completion_length": 202.375, "epoch": 0.06125, "grad_norm": 0.47575071454048157, "kl": 0.01339123584330082, "learning_rate": 3.0625000000000003e-06, "loss": 0.0005, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 245 }, { "completion_length": 190.375, "epoch": 0.0615, "grad_norm": 0.4257509410381317, "kl": 0.01585126481950283, "learning_rate": 3.075e-06, "loss": 0.0006, "reward": 1.193750023841858, "reward_std": 0.8304204940795898, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.75, "step": 246 }, { "completion_length": 168.125, "epoch": 0.06175, "grad_norm": 0.4466553330421448, "kl": 0.015615028329193592, "learning_rate": 3.0875000000000005e-06, "loss": 0.0006, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 247 }, { "completion_length": 190.5, "epoch": 0.062, "grad_norm": 0.5046932101249695, "kl": 0.020424310117959976, "learning_rate": 3.1000000000000004e-06, "loss": 0.0008, "reward": 0.8062499761581421, "reward_std": 0.7242915630340576, "rewards/_accuracy_reward": 0.18125000596046448, "rewards/_format_reward": 0.625, "step": 248 }, { "completion_length": 115.125, "epoch": 0.06225, "grad_norm": 0.9507879614830017, "kl": 0.027173198759555817, "learning_rate": 3.1125000000000007e-06, "loss": 0.0011, "reward": 1.4075000286102295, "reward_std": 0.3749571740627289, "rewards/_accuracy_reward": 0.4074999988079071, "rewards/_format_reward": 1.0, "step": 249 }, { "completion_length": 226.25, "epoch": 0.0625, "grad_norm": 0.4514904022216797, "kl": 0.014395845122635365, "learning_rate": 3.125e-06, "loss": 0.0006, "reward": 1.15625, "reward_std": 0.9904679656028748, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 0.625, "step": 250 }, { "completion_length": 183.0, "epoch": 0.06275, "grad_norm": 0.482563853263855, "kl": 0.015461320988833904, "learning_rate": 3.1375e-06, "loss": 0.0006, "reward": 1.46875, "reward_std": 0.6999680995941162, "rewards/_accuracy_reward": 0.59375, "rewards/_format_reward": 0.875, "step": 251 }, { "completion_length": 210.875, "epoch": 0.063, "grad_norm": 0.44888806343078613, "kl": 0.013685889542102814, "learning_rate": 3.1500000000000003e-06, "loss": 0.0005, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.875, "step": 252 }, { "completion_length": 183.625, "epoch": 0.06325, "grad_norm": 0.47426795959472656, "kl": 0.01469960156828165, "learning_rate": 3.1625000000000002e-06, "loss": 0.0006, "reward": 1.381250023841858, "reward_std": 0.9133679866790771, "rewards/_accuracy_reward": 0.6312500238418579, "rewards/_format_reward": 0.75, "step": 253 }, { "completion_length": 123.25, "epoch": 0.0635, "grad_norm": 0.6966858506202698, "kl": 0.02002432383596897, "learning_rate": 3.175e-06, "loss": 0.0008, "reward": 1.5625, "reward_std": 0.6648039817810059, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.75, "step": 254 }, { "completion_length": 194.25, "epoch": 0.06375, "grad_norm": 0.5240160226821899, "kl": 0.025446368381381035, "learning_rate": 3.1875e-06, "loss": 0.001, "reward": 1.100000023841858, "reward_std": 0.7662525177001953, "rewards/_accuracy_reward": 0.3499999940395355, "rewards/_format_reward": 0.75, "step": 255 }, { "completion_length": 232.125, "epoch": 0.064, "grad_norm": 0.46555182337760925, "kl": 0.013254818506538868, "learning_rate": 3.2000000000000003e-06, "loss": 0.0005, "reward": 0.39374998211860657, "reward_std": 0.5434265732765198, "rewards/_accuracy_reward": 0.01875000074505806, "rewards/_format_reward": 0.375, "step": 256 }, { "completion_length": 209.375, "epoch": 0.06425, "grad_norm": 0.4615981876850128, "kl": 0.01638483628630638, "learning_rate": 3.2125e-06, "loss": 0.0007, "reward": 1.15625, "reward_std": 0.9904679656028748, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 0.625, "step": 257 }, { "completion_length": 176.0, "epoch": 0.0645, "grad_norm": 0.47257405519485474, "kl": 0.01924911141395569, "learning_rate": 3.2250000000000005e-06, "loss": 0.0008, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 258 }, { "completion_length": 188.25, "epoch": 0.06475, "grad_norm": 0.538159191608429, "kl": 0.02660544216632843, "learning_rate": 3.2375e-06, "loss": 0.0011, "reward": 1.193750023841858, "reward_std": 0.8304204940795898, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.75, "step": 259 }, { "completion_length": 197.375, "epoch": 0.065, "grad_norm": 0.46662867069244385, "kl": 0.021138174459338188, "learning_rate": 3.2500000000000002e-06, "loss": 0.0008, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 260 }, { "completion_length": 182.25, "epoch": 0.06525, "grad_norm": 0.4960637390613556, "kl": 0.023767707869410515, "learning_rate": 3.2625e-06, "loss": 0.001, "reward": 0.8624999523162842, "reward_std": 0.5403371453285217, "rewards/_accuracy_reward": 0.11250000447034836, "rewards/_format_reward": 0.75, "step": 261 }, { "completion_length": 100.0, "epoch": 0.0655, "grad_norm": 0.8033775687217712, "kl": 0.021194277331233025, "learning_rate": 3.2750000000000004e-06, "loss": 0.0008, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.668749988079071, "rewards/_format_reward": 1.0, "step": 262 }, { "completion_length": 147.375, "epoch": 0.06575, "grad_norm": 0.5282887816429138, "kl": 0.02339054085314274, "learning_rate": 3.2875000000000003e-06, "loss": 0.0009, "reward": 1.506250023841858, "reward_std": 0.4144165515899658, "rewards/_accuracy_reward": 0.5062500238418579, "rewards/_format_reward": 1.0, "step": 263 }, { "completion_length": 190.25, "epoch": 0.066, "grad_norm": 0.5130710005760193, "kl": 0.020366905257105827, "learning_rate": 3.3000000000000006e-06, "loss": 0.0008, "reward": 1.46875, "reward_std": 0.6999680995941162, "rewards/_accuracy_reward": 0.59375, "rewards/_format_reward": 0.875, "step": 264 }, { "completion_length": 177.0, "epoch": 0.06625, "grad_norm": 0.6445785760879517, "kl": 0.02009623870253563, "learning_rate": 3.3125e-06, "loss": 0.0008, "reward": 1.162500023841858, "reward_std": 0.8327021598815918, "rewards/_accuracy_reward": 0.5375000238418579, "rewards/_format_reward": 0.625, "step": 265 }, { "completion_length": 188.875, "epoch": 0.0665, "grad_norm": 0.4418463706970215, "kl": 0.016711309552192688, "learning_rate": 3.3250000000000004e-06, "loss": 0.0007, "reward": 0.90625, "reward_std": 0.6483150720596313, "rewards/_accuracy_reward": 0.1562499850988388, "rewards/_format_reward": 0.75, "step": 266 }, { "completion_length": 132.625, "epoch": 0.06675, "grad_norm": 0.6308992505073547, "kl": 0.02185102552175522, "learning_rate": 3.3375000000000002e-06, "loss": 0.0009, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 267 }, { "completion_length": 187.75, "epoch": 0.067, "grad_norm": 0.5287885665893555, "kl": 0.028705699369311333, "learning_rate": 3.3500000000000005e-06, "loss": 0.0011, "reward": 1.6375000476837158, "reward_std": 0.7224709987640381, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 0.875, "step": 268 }, { "completion_length": 145.25, "epoch": 0.06725, "grad_norm": 0.6234681606292725, "kl": 0.027406934648752213, "learning_rate": 3.3625000000000004e-06, "loss": 0.0011, "reward": 1.3125, "reward_std": 0.5469068884849548, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 0.875, "step": 269 }, { "completion_length": 77.125, "epoch": 0.0675, "grad_norm": 0.047009434551000595, "kl": 0.03576899319887161, "learning_rate": 3.3750000000000003e-06, "loss": 0.0014, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 270 }, { "completion_length": 128.125, "epoch": 0.06775, "grad_norm": 0.6146723031997681, "kl": 0.036112938076257706, "learning_rate": 3.3875e-06, "loss": 0.0014, "reward": 1.28125, "reward_std": 0.6187184453010559, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.875, "step": 271 }, { "completion_length": 151.375, "epoch": 0.068, "grad_norm": 0.5604023337364197, "kl": 0.044364482164382935, "learning_rate": 3.4000000000000005e-06, "loss": 0.0018, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 272 }, { "completion_length": 203.375, "epoch": 0.06825, "grad_norm": 0.560128390789032, "kl": 0.018296649679541588, "learning_rate": 3.4125000000000004e-06, "loss": 0.0007, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 273 }, { "completion_length": 186.0, "epoch": 0.0685, "grad_norm": 0.6812446117401123, "kl": 0.03642842918634415, "learning_rate": 3.4250000000000007e-06, "loss": 0.0015, "reward": 0.7699999809265137, "reward_std": 0.7155816555023193, "rewards/_accuracy_reward": 0.14499999582767487, "rewards/_format_reward": 0.625, "step": 274 }, { "completion_length": 199.25, "epoch": 0.06875, "grad_norm": 0.5815830230712891, "kl": 0.036607492715120316, "learning_rate": 3.4375e-06, "loss": 0.0015, "reward": 1.0625, "reward_std": 0.933025062084198, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 0.625, "step": 275 }, { "completion_length": 185.0, "epoch": 0.069, "grad_norm": 1.3024146556854248, "kl": 0.0446808896958828, "learning_rate": 3.45e-06, "loss": 0.0018, "reward": 1.21875, "reward_std": 0.8284828662872314, "rewards/_accuracy_reward": 0.46875, "rewards/_format_reward": 0.75, "step": 276 }, { "completion_length": 131.875, "epoch": 0.06925, "grad_norm": 0.6265179514884949, "kl": 0.03437270596623421, "learning_rate": 3.4625000000000003e-06, "loss": 0.0014, "reward": 1.59375, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 0.875, "step": 277 }, { "completion_length": 122.0, "epoch": 0.0695, "grad_norm": 0.6202266812324524, "kl": 0.04929348826408386, "learning_rate": 3.475e-06, "loss": 0.002, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 278 }, { "completion_length": 116.5, "epoch": 0.06975, "grad_norm": 0.8051105737686157, "kl": 0.049348097294569016, "learning_rate": 3.4875000000000005e-06, "loss": 0.002, "reward": 1.5750000476837158, "reward_std": 0.4605897068977356, "rewards/_accuracy_reward": 0.574999988079071, "rewards/_format_reward": 1.0, "step": 279 }, { "completion_length": 202.5, "epoch": 0.07, "grad_norm": 0.5659109950065613, "kl": 0.04884558916091919, "learning_rate": 3.5e-06, "loss": 0.002, "reward": 1.125, "reward_std": 0.7676494717597961, "rewards/_accuracy_reward": 0.375, "rewards/_format_reward": 0.75, "step": 280 }, { "completion_length": 81.0, "epoch": 0.07025, "grad_norm": 1.043556571006775, "kl": 0.043515197932720184, "learning_rate": 3.5125000000000003e-06, "loss": 0.0017, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 281 }, { "completion_length": 187.25, "epoch": 0.0705, "grad_norm": 1.5090237855911255, "kl": 0.07255250960588455, "learning_rate": 3.525e-06, "loss": 0.0029, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 282 }, { "completion_length": 111.375, "epoch": 0.07075, "grad_norm": 0.7345578670501709, "kl": 0.04479851573705673, "learning_rate": 3.5375000000000004e-06, "loss": 0.0018, "reward": 1.5750000476837158, "reward_std": 0.4605897068977356, "rewards/_accuracy_reward": 0.574999988079071, "rewards/_format_reward": 1.0, "step": 283 }, { "completion_length": 66.25, "epoch": 0.071, "grad_norm": 1.220955491065979, "kl": 0.05355329439043999, "learning_rate": 3.5500000000000003e-06, "loss": 0.0021, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 284 }, { "completion_length": 124.375, "epoch": 0.07125, "grad_norm": 0.8568919897079468, "kl": 0.040953654795885086, "learning_rate": 3.5625e-06, "loss": 0.0016, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 285 }, { "completion_length": 141.25, "epoch": 0.0715, "grad_norm": 0.030418027192354202, "kl": 0.03461394086480141, "learning_rate": 3.575e-06, "loss": 0.0014, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 286 }, { "completion_length": 107.5, "epoch": 0.07175, "grad_norm": 0.7467533349990845, "kl": 0.05478259548544884, "learning_rate": 3.5875000000000004e-06, "loss": 0.0022, "reward": 1.4262499809265137, "reward_std": 0.48056328296661377, "rewards/_accuracy_reward": 0.42624998092651367, "rewards/_format_reward": 1.0, "step": 287 }, { "completion_length": 151.25, "epoch": 0.072, "grad_norm": 0.6246471405029297, "kl": 0.036742065101861954, "learning_rate": 3.6000000000000003e-06, "loss": 0.0015, "reward": 1.2625000476837158, "reward_std": 0.8826704621315002, "rewards/_accuracy_reward": 0.512499988079071, "rewards/_format_reward": 0.75, "step": 288 }, { "completion_length": 126.625, "epoch": 0.07225, "grad_norm": 0.7587897777557373, "kl": 0.06992122530937195, "learning_rate": 3.6125000000000006e-06, "loss": 0.0028, "reward": 1.59375, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 0.875, "step": 289 }, { "completion_length": 170.375, "epoch": 0.0725, "grad_norm": 0.5562832951545715, "kl": 0.053098414093256, "learning_rate": 3.625e-06, "loss": 0.0021, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.875, "step": 290 }, { "completion_length": 54.625, "epoch": 0.07275, "grad_norm": 1.5443010330200195, "kl": 0.07868895679712296, "learning_rate": 3.6375000000000003e-06, "loss": 0.0031, "reward": 1.318750023841858, "reward_std": 0.28402402997016907, "rewards/_accuracy_reward": 0.3187499940395355, "rewards/_format_reward": 1.0, "step": 291 }, { "completion_length": 196.25, "epoch": 0.073, "grad_norm": 0.6690995097160339, "kl": 0.029539842158555984, "learning_rate": 3.65e-06, "loss": 0.0012, "reward": 0.668749988079071, "reward_std": 0.7736451625823975, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 0.5, "step": 292 }, { "completion_length": 174.125, "epoch": 0.07325, "grad_norm": 0.6143000721931458, "kl": 0.05302724614739418, "learning_rate": 3.6625000000000005e-06, "loss": 0.0021, "reward": 1.4375, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.75, "step": 293 }, { "completion_length": 118.125, "epoch": 0.0735, "grad_norm": 0.7825801372528076, "kl": 0.06216191127896309, "learning_rate": 3.6750000000000004e-06, "loss": 0.0025, "reward": 1.600000023841858, "reward_std": 0.43260011076927185, "rewards/_accuracy_reward": 0.6000000238418579, "rewards/_format_reward": 1.0, "step": 294 }, { "completion_length": 181.25, "epoch": 0.07375, "grad_norm": 0.6875461935997009, "kl": 0.03431488573551178, "learning_rate": 3.6875000000000007e-06, "loss": 0.0014, "reward": 1.46875, "reward_std": 0.6999680995941162, "rewards/_accuracy_reward": 0.59375, "rewards/_format_reward": 0.875, "step": 295 }, { "completion_length": 99.75, "epoch": 0.074, "grad_norm": 0.972676157951355, "kl": 0.05988858640193939, "learning_rate": 3.7e-06, "loss": 0.0024, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 296 }, { "completion_length": 120.5, "epoch": 0.07425, "grad_norm": 0.6937150955200195, "kl": 0.06560764461755753, "learning_rate": 3.7125000000000005e-06, "loss": 0.0026, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 297 }, { "completion_length": 110.0, "epoch": 0.0745, "grad_norm": 1.0664499998092651, "kl": 0.08323174715042114, "learning_rate": 3.7250000000000003e-06, "loss": 0.0033, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 298 }, { "completion_length": 64.75, "epoch": 0.07475, "grad_norm": 0.9208748936653137, "kl": 0.051380615681409836, "learning_rate": 3.7375000000000006e-06, "loss": 0.0021, "reward": 1.4812500476837158, "reward_std": 0.4374745190143585, "rewards/_accuracy_reward": 0.48124998807907104, "rewards/_format_reward": 1.0, "step": 299 }, { "completion_length": 148.0, "epoch": 0.075, "grad_norm": 0.6525527238845825, "kl": 0.05159568786621094, "learning_rate": 3.7500000000000005e-06, "loss": 0.0021, "reward": 1.087499976158142, "reward_std": 0.5442885160446167, "rewards/_accuracy_reward": 0.2124999761581421, "rewards/_format_reward": 0.875, "step": 300 }, { "completion_length": 167.25, "epoch": 0.07525, "grad_norm": 0.7984333038330078, "kl": 0.06883440166711807, "learning_rate": 3.7625e-06, "loss": 0.0028, "reward": 1.6875, "reward_std": 0.6373774409294128, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 301 }, { "completion_length": 104.625, "epoch": 0.0755, "grad_norm": 0.7018251419067383, "kl": 0.04696748033165932, "learning_rate": 3.7750000000000003e-06, "loss": 0.0019, "reward": 1.3624999523162842, "reward_std": 0.404218852519989, "rewards/_accuracy_reward": 0.36249998211860657, "rewards/_format_reward": 1.0, "step": 302 }, { "completion_length": 148.375, "epoch": 0.07575, "grad_norm": 0.07361488789319992, "kl": 0.08332055807113647, "learning_rate": 3.7875e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 303 }, { "completion_length": 132.625, "epoch": 0.076, "grad_norm": 0.7021421194076538, "kl": 0.05917055159807205, "learning_rate": 3.8000000000000005e-06, "loss": 0.0024, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 304 }, { "completion_length": 175.5, "epoch": 0.07625, "grad_norm": 0.6244091391563416, "kl": 0.06270697712898254, "learning_rate": 3.8125e-06, "loss": 0.0025, "reward": 1.2687499523162842, "reward_std": 0.699968159198761, "rewards/_accuracy_reward": 0.518750011920929, "rewards/_format_reward": 0.75, "step": 305 }, { "completion_length": 98.25, "epoch": 0.0765, "grad_norm": 1.3864850997924805, "kl": 0.078142911195755, "learning_rate": 3.825000000000001e-06, "loss": 0.0031, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 1.0, "step": 306 }, { "completion_length": 134.5, "epoch": 0.07675, "grad_norm": 0.5813739895820618, "kl": 0.041886672377586365, "learning_rate": 3.8375e-06, "loss": 0.0017, "reward": 1.2312500476837158, "reward_std": 0.6284547448158264, "rewards/_accuracy_reward": 0.35624998807907104, "rewards/_format_reward": 0.875, "step": 307 }, { "completion_length": 168.5, "epoch": 0.077, "grad_norm": 0.5561500191688538, "kl": 0.06397830694913864, "learning_rate": 3.85e-06, "loss": 0.0026, "reward": 1.3250000476837158, "reward_std": 0.6850443482398987, "rewards/_accuracy_reward": 0.44999998807907104, "rewards/_format_reward": 0.875, "step": 308 }, { "completion_length": 126.125, "epoch": 0.07725, "grad_norm": 0.6728265881538391, "kl": 0.057980045676231384, "learning_rate": 3.8625e-06, "loss": 0.0023, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 309 }, { "completion_length": 105.25, "epoch": 0.0775, "grad_norm": 0.9413458108901978, "kl": 0.10995390266180038, "learning_rate": 3.875e-06, "loss": 0.0044, "reward": 1.5950000286102295, "reward_std": 0.4400324523448944, "rewards/_accuracy_reward": 0.5950000286102295, "rewards/_format_reward": 1.0, "step": 310 }, { "completion_length": 108.5, "epoch": 0.07775, "grad_norm": 0.742570698261261, "kl": 0.06104608625173569, "learning_rate": 3.8875000000000005e-06, "loss": 0.0024, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7575000524520874, "rewards/_format_reward": 1.0, "step": 311 }, { "completion_length": 110.75, "epoch": 0.078, "grad_norm": 1.0395474433898926, "kl": 0.088965505361557, "learning_rate": 3.900000000000001e-06, "loss": 0.0036, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.643750011920929, "rewards/_format_reward": 1.0, "step": 312 }, { "completion_length": 144.875, "epoch": 0.07825, "grad_norm": 0.6619213223457336, "kl": 0.06248822063207626, "learning_rate": 3.9125e-06, "loss": 0.0025, "reward": 1.0625, "reward_std": 0.5403372049331665, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.875, "step": 313 }, { "completion_length": 109.0, "epoch": 0.0785, "grad_norm": 0.7599800229072571, "kl": 0.0571325309574604, "learning_rate": 3.9250000000000005e-06, "loss": 0.0023, "reward": 1.4075000286102295, "reward_std": 0.3749571442604065, "rewards/_accuracy_reward": 0.4074999988079071, "rewards/_format_reward": 1.0, "step": 314 }, { "completion_length": 73.0, "epoch": 0.07875, "grad_norm": 0.9942233562469482, "kl": 0.05285262688994408, "learning_rate": 3.9375e-06, "loss": 0.0021, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 315 }, { "completion_length": 163.0, "epoch": 0.079, "grad_norm": 0.7015470266342163, "kl": 0.06908886879682541, "learning_rate": 3.95e-06, "loss": 0.0028, "reward": 1.3250000476837158, "reward_std": 0.6850443482398987, "rewards/_accuracy_reward": 0.44999998807907104, "rewards/_format_reward": 0.875, "step": 316 }, { "completion_length": 72.875, "epoch": 0.07925, "grad_norm": 1.0622810125350952, "kl": 0.10289790481328964, "learning_rate": 3.962500000000001e-06, "loss": 0.0041, "reward": 1.658750057220459, "reward_std": 0.47675803303718567, "rewards/_accuracy_reward": 0.6587499976158142, "rewards/_format_reward": 1.0, "step": 317 }, { "completion_length": 72.0, "epoch": 0.0795, "grad_norm": 0.8196299076080322, "kl": 0.05889907479286194, "learning_rate": 3.975000000000001e-06, "loss": 0.0024, "reward": 1.412500023841858, "reward_std": 0.3691205680370331, "rewards/_accuracy_reward": 0.4124999940395355, "rewards/_format_reward": 1.0, "step": 318 }, { "completion_length": 106.25, "epoch": 0.07975, "grad_norm": 0.8157685399055481, "kl": 0.06252449005842209, "learning_rate": 3.9875e-06, "loss": 0.0025, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 319 }, { "completion_length": 101.25, "epoch": 0.08, "grad_norm": 0.7154927253723145, "kl": 0.05639846250414848, "learning_rate": 4.000000000000001e-06, "loss": 0.0023, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 320 }, { "completion_length": 118.75, "epoch": 0.08025, "grad_norm": 0.950957179069519, "kl": 0.11179396510124207, "learning_rate": 4.0125e-06, "loss": 0.0045, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 321 }, { "completion_length": 111.125, "epoch": 0.0805, "grad_norm": 0.7619085907936096, "kl": 0.060069490224123, "learning_rate": 4.0250000000000004e-06, "loss": 0.0024, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 322 }, { "completion_length": 128.5, "epoch": 0.08075, "grad_norm": 0.9389228224754333, "kl": 0.0730435773730278, "learning_rate": 4.037500000000001e-06, "loss": 0.0029, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 323 }, { "completion_length": 158.5, "epoch": 0.081, "grad_norm": 0.846824049949646, "kl": 0.08881374448537827, "learning_rate": 4.05e-06, "loss": 0.0036, "reward": 1.1875, "reward_std": 0.9519716501235962, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.625, "step": 324 }, { "completion_length": 155.375, "epoch": 0.08125, "grad_norm": 0.58637934923172, "kl": 0.08695843815803528, "learning_rate": 4.0625000000000005e-06, "loss": 0.0035, "reward": 0.9187499284744263, "reward_std": 0.3514637351036072, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 0.75, "step": 325 }, { "completion_length": 84.625, "epoch": 0.0815, "grad_norm": 1.0473264455795288, "kl": 0.06300223618745804, "learning_rate": 4.075e-06, "loss": 0.0025, "reward": 1.5499999523162842, "reward_std": 0.4855042099952698, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 326 }, { "completion_length": 139.75, "epoch": 0.08175, "grad_norm": 0.7010840177536011, "kl": 0.05449886992573738, "learning_rate": 4.0875e-06, "loss": 0.0022, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 327 }, { "completion_length": 97.5, "epoch": 0.082, "grad_norm": 0.8196879029273987, "kl": 0.09262411296367645, "learning_rate": 4.1e-06, "loss": 0.0037, "reward": 1.5499999523162842, "reward_std": 0.4855042099952698, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 328 }, { "completion_length": 124.25, "epoch": 0.08225, "grad_norm": 1.0033892393112183, "kl": 0.10067097842693329, "learning_rate": 4.1125e-06, "loss": 0.004, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 329 }, { "completion_length": 155.125, "epoch": 0.0825, "grad_norm": 0.6117260456085205, "kl": 0.0802069678902626, "learning_rate": 4.125e-06, "loss": 0.0032, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 330 }, { "completion_length": 90.625, "epoch": 0.08275, "grad_norm": 1.334694743156433, "kl": 0.11455602198839188, "learning_rate": 4.137500000000001e-06, "loss": 0.0046, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 331 }, { "completion_length": 76.5, "epoch": 0.083, "grad_norm": 1.0724419355392456, "kl": 0.09013547003269196, "learning_rate": 4.15e-06, "loss": 0.0036, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 332 }, { "completion_length": 102.875, "epoch": 0.08325, "grad_norm": 1.0087305307388306, "kl": 0.0930456668138504, "learning_rate": 4.1625e-06, "loss": 0.0037, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 333 }, { "completion_length": 137.875, "epoch": 0.0835, "grad_norm": 0.6068035960197449, "kl": 0.0930265486240387, "learning_rate": 4.175e-06, "loss": 0.0037, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 334 }, { "completion_length": 83.375, "epoch": 0.08375, "grad_norm": 1.1466654539108276, "kl": 0.09693353623151779, "learning_rate": 4.1875e-06, "loss": 0.0039, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 335 }, { "completion_length": 135.125, "epoch": 0.084, "grad_norm": 1.0346399545669556, "kl": 0.07693766057491302, "learning_rate": 4.2000000000000004e-06, "loss": 0.0031, "reward": 1.5700000524520874, "reward_std": 0.46727174520492554, "rewards/_accuracy_reward": 0.5699999928474426, "rewards/_format_reward": 1.0, "step": 336 }, { "completion_length": 127.375, "epoch": 0.08425, "grad_norm": 0.7949957847595215, "kl": 0.16617698967456818, "learning_rate": 4.212500000000001e-06, "loss": 0.0066, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 337 }, { "completion_length": 98.25, "epoch": 0.0845, "grad_norm": 0.9801748394966125, "kl": 0.09285181015729904, "learning_rate": 4.225e-06, "loss": 0.0037, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.550000011920929, "rewards/_format_reward": 1.0, "step": 338 }, { "completion_length": 129.0, "epoch": 0.08475, "grad_norm": 1.0321601629257202, "kl": 0.08525305241346359, "learning_rate": 4.2375000000000005e-06, "loss": 0.0034, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 339 }, { "completion_length": 117.0, "epoch": 0.085, "grad_norm": 1.1353120803833008, "kl": 0.10381244868040085, "learning_rate": 4.25e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 340 }, { "completion_length": 67.5, "epoch": 0.08525, "grad_norm": 0.987204372882843, "kl": 0.1275581568479538, "learning_rate": 4.2625e-06, "loss": 0.0051, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 341 }, { "completion_length": 88.25, "epoch": 0.0855, "grad_norm": 1.1097989082336426, "kl": 0.10488829016685486, "learning_rate": 4.2750000000000006e-06, "loss": 0.0042, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7574999928474426, "rewards/_format_reward": 1.0, "step": 342 }, { "completion_length": 101.625, "epoch": 0.08575, "grad_norm": 1.0648112297058105, "kl": 0.11403176933526993, "learning_rate": 4.287500000000001e-06, "loss": 0.0046, "reward": 1.3875000476837158, "reward_std": 0.3879893124103546, "rewards/_accuracy_reward": 0.38749998807907104, "rewards/_format_reward": 1.0, "step": 343 }, { "completion_length": 98.875, "epoch": 0.086, "grad_norm": 0.8387040495872498, "kl": 0.11271817982196808, "learning_rate": 4.3e-06, "loss": 0.0045, "reward": 1.756250023841858, "reward_std": 0.6894291639328003, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 344 }, { "completion_length": 128.125, "epoch": 0.08625, "grad_norm": 0.7294446229934692, "kl": 0.12787966430187225, "learning_rate": 4.312500000000001e-06, "loss": 0.0051, "reward": 1.1762499809265137, "reward_std": 0.6348889470100403, "rewards/_accuracy_reward": 0.30125001072883606, "rewards/_format_reward": 0.875, "step": 345 }, { "completion_length": 159.25, "epoch": 0.0865, "grad_norm": 0.8217050433158875, "kl": 0.14619140326976776, "learning_rate": 4.325e-06, "loss": 0.0058, "reward": 1.162500023841858, "reward_std": 0.832702100276947, "rewards/_accuracy_reward": 0.5375000238418579, "rewards/_format_reward": 0.625, "step": 346 }, { "completion_length": 129.25, "epoch": 0.08675, "grad_norm": 0.9098784923553467, "kl": 0.13450828194618225, "learning_rate": 4.3375e-06, "loss": 0.0054, "reward": 1.431249976158142, "reward_std": 0.47579824924468994, "rewards/_accuracy_reward": 0.4312499761581421, "rewards/_format_reward": 1.0, "step": 347 }, { "completion_length": 151.0, "epoch": 0.087, "grad_norm": 0.7144613265991211, "kl": 0.1879410594701767, "learning_rate": 4.350000000000001e-06, "loss": 0.0075, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.875, "step": 348 }, { "completion_length": 84.375, "epoch": 0.08725, "grad_norm": 1.6553195714950562, "kl": 0.10268331319093704, "learning_rate": 4.362500000000001e-06, "loss": 0.0041, "reward": 1.375, "reward_std": 0.6681531071662903, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.875, "step": 349 }, { "completion_length": 61.25, "epoch": 0.0875, "grad_norm": 1.4001189470291138, "kl": 0.18523964285850525, "learning_rate": 4.3750000000000005e-06, "loss": 0.0074, "reward": 1.5012500286102295, "reward_std": 0.42089828848838806, "rewards/_accuracy_reward": 0.5012500286102295, "rewards/_format_reward": 1.0, "step": 350 }, { "completion_length": 139.375, "epoch": 0.08775, "grad_norm": 0.5610023140907288, "kl": 0.09610922634601593, "learning_rate": 4.3875e-06, "loss": 0.0038, "reward": 1.375, "reward_std": 0.6681531071662903, "rewards/_accuracy_reward": 0.5, "rewards/_format_reward": 0.875, "step": 351 }, { "completion_length": 167.5, "epoch": 0.088, "grad_norm": 0.8529553413391113, "kl": 0.1177278384566307, "learning_rate": 4.4e-06, "loss": 0.0047, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.2812499701976776, "rewards/_format_reward": 0.875, "step": 352 }, { "completion_length": 134.75, "epoch": 0.08825, "grad_norm": 0.7346218824386597, "kl": 0.12559470534324646, "learning_rate": 4.4125000000000005e-06, "loss": 0.005, "reward": 1.274999976158142, "reward_std": 0.6974443197250366, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.875, "step": 353 }, { "completion_length": 131.375, "epoch": 0.0885, "grad_norm": 0.7096778154373169, "kl": 0.15666167438030243, "learning_rate": 4.425e-06, "loss": 0.0063, "reward": 1.6875, "reward_std": 0.6373774409294128, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 354 }, { "completion_length": 129.5, "epoch": 0.08875, "grad_norm": 0.7917588949203491, "kl": 0.16912636160850525, "learning_rate": 4.4375e-06, "loss": 0.0068, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 355 }, { "completion_length": 108.75, "epoch": 0.089, "grad_norm": 1.4511607885360718, "kl": 0.1251247525215149, "learning_rate": 4.450000000000001e-06, "loss": 0.005, "reward": 1.5012500286102295, "reward_std": 0.42089828848838806, "rewards/_accuracy_reward": 0.5012500286102295, "rewards/_format_reward": 1.0, "step": 356 }, { "completion_length": 114.25, "epoch": 0.08925, "grad_norm": 0.9404997229576111, "kl": 0.09536008536815643, "learning_rate": 4.4625e-06, "loss": 0.0038, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 357 }, { "completion_length": 101.25, "epoch": 0.0895, "grad_norm": 0.7459734082221985, "kl": 0.1477159559726715, "learning_rate": 4.475e-06, "loss": 0.0059, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 358 }, { "completion_length": 85.625, "epoch": 0.08975, "grad_norm": 1.143880009651184, "kl": 0.09666404128074646, "learning_rate": 4.4875e-06, "loss": 0.0039, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 359 }, { "completion_length": 204.375, "epoch": 0.09, "grad_norm": 0.9317293167114258, "kl": 0.31011974811553955, "learning_rate": 4.5e-06, "loss": 0.0124, "reward": 0.762499988079071, "reward_std": 0.8907260298728943, "rewards/_accuracy_reward": 0.26249998807907104, "rewards/_format_reward": 0.5, "step": 360 }, { "completion_length": 164.0, "epoch": 0.09025, "grad_norm": 0.8388033509254456, "kl": 0.1918019950389862, "learning_rate": 4.5125e-06, "loss": 0.0077, "reward": 1.0512499809265137, "reward_std": 0.7511028051376343, "rewards/_accuracy_reward": 0.30124998092651367, "rewards/_format_reward": 0.75, "step": 361 }, { "completion_length": 73.125, "epoch": 0.0905, "grad_norm": 1.0067369937896729, "kl": 0.09683433175086975, "learning_rate": 4.525000000000001e-06, "loss": 0.0039, "reward": 1.243749976158142, "reward_std": 0.32120034098625183, "rewards/_accuracy_reward": 0.24374999105930328, "rewards/_format_reward": 1.0, "step": 362 }, { "completion_length": 175.625, "epoch": 0.09075, "grad_norm": 0.79710453748703, "kl": 0.15141978859901428, "learning_rate": 4.5375e-06, "loss": 0.0061, "reward": 1.03125, "reward_std": 0.9300297498703003, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 0.5, "step": 363 }, { "completion_length": 106.25, "epoch": 0.091, "grad_norm": 0.7696452736854553, "kl": 0.13706472516059875, "learning_rate": 4.5500000000000005e-06, "loss": 0.0055, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 364 }, { "completion_length": 102.375, "epoch": 0.09125, "grad_norm": 0.6263626217842102, "kl": 0.18292012810707092, "learning_rate": 4.5625e-06, "loss": 0.0073, "reward": 1.09375, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.21875, "rewards/_format_reward": 0.875, "step": 365 }, { "completion_length": 123.75, "epoch": 0.0915, "grad_norm": 0.6248298287391663, "kl": 0.0651947632431984, "learning_rate": 4.575e-06, "loss": 0.0026, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 366 }, { "completion_length": 108.5, "epoch": 0.09175, "grad_norm": 1.123427391052246, "kl": 0.1323918253183365, "learning_rate": 4.5875000000000005e-06, "loss": 0.0053, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 367 }, { "completion_length": 159.5, "epoch": 0.092, "grad_norm": 0.7406381368637085, "kl": 0.1296067088842392, "learning_rate": 4.600000000000001e-06, "loss": 0.0052, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.643750011920929, "rewards/_format_reward": 1.0, "step": 368 }, { "completion_length": 164.75, "epoch": 0.09225, "grad_norm": 0.717776358127594, "kl": 0.1630459874868393, "learning_rate": 4.6125e-06, "loss": 0.0065, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 369 }, { "completion_length": 86.25, "epoch": 0.0925, "grad_norm": 1.1500803232192993, "kl": 0.12640973925590515, "learning_rate": 4.625000000000001e-06, "loss": 0.0051, "reward": 1.756250023841858, "reward_std": 0.45153507590293884, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 370 }, { "completion_length": 161.375, "epoch": 0.09275, "grad_norm": 0.8110669255256653, "kl": 0.13621176779270172, "learning_rate": 4.6375e-06, "loss": 0.0054, "reward": 1.1437499523162842, "reward_std": 0.8317097425460815, "rewards/_accuracy_reward": 0.39374998211860657, "rewards/_format_reward": 0.75, "step": 371 }, { "completion_length": 175.375, "epoch": 0.093, "grad_norm": 0.7024534344673157, "kl": 0.1695256382226944, "learning_rate": 4.65e-06, "loss": 0.0068, "reward": 1.1875, "reward_std": 0.9519716501235962, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.625, "step": 372 }, { "completion_length": 145.125, "epoch": 0.09325, "grad_norm": 1.101328730583191, "kl": 0.17406335473060608, "learning_rate": 4.662500000000001e-06, "loss": 0.007, "reward": 1.381250023841858, "reward_std": 0.9133679866790771, "rewards/_accuracy_reward": 0.6312500238418579, "rewards/_format_reward": 0.75, "step": 373 }, { "completion_length": 164.5, "epoch": 0.0935, "grad_norm": 0.740450918674469, "kl": 0.15864543616771698, "learning_rate": 4.675000000000001e-06, "loss": 0.0063, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 374 }, { "completion_length": 200.25, "epoch": 0.09375, "grad_norm": 0.7958067059516907, "kl": 0.35587310791015625, "learning_rate": 4.6875000000000004e-06, "loss": 0.0142, "reward": 1.125, "reward_std": 0.9910312294960022, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.5, "step": 375 }, { "completion_length": 119.125, "epoch": 0.094, "grad_norm": 0.9200760126113892, "kl": 0.07643859833478928, "learning_rate": 4.7e-06, "loss": 0.0031, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 376 }, { "completion_length": 120.75, "epoch": 0.09425, "grad_norm": 0.7784951329231262, "kl": 0.09547189623117447, "learning_rate": 4.7125e-06, "loss": 0.0038, "reward": 1.225000023841858, "reward_std": 0.8180989027023315, "rewards/_accuracy_reward": 0.4749999940395355, "rewards/_format_reward": 0.75, "step": 377 }, { "completion_length": 76.0, "epoch": 0.0945, "grad_norm": 1.4544790983200073, "kl": 0.16331760585308075, "learning_rate": 4.7250000000000005e-06, "loss": 0.0065, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 378 }, { "completion_length": 105.75, "epoch": 0.09475, "grad_norm": 0.9481674432754517, "kl": 0.1655467301607132, "learning_rate": 4.737500000000001e-06, "loss": 0.0066, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 379 }, { "completion_length": 117.625, "epoch": 0.095, "grad_norm": 1.2458184957504272, "kl": 0.12308754026889801, "learning_rate": 4.75e-06, "loss": 0.0049, "reward": 1.2625000476837158, "reward_std": 0.31707367300987244, "rewards/_accuracy_reward": 0.38749998807907104, "rewards/_format_reward": 0.875, "step": 380 }, { "completion_length": 141.875, "epoch": 0.09525, "grad_norm": 2.0550785064697266, "kl": 0.24554966390132904, "learning_rate": 4.7625000000000006e-06, "loss": 0.0098, "reward": 1.21875, "reward_std": 0.8284828662872314, "rewards/_accuracy_reward": 0.46875, "rewards/_format_reward": 0.75, "step": 381 }, { "completion_length": 134.375, "epoch": 0.0955, "grad_norm": 0.8591959476470947, "kl": 0.10585917532444, "learning_rate": 4.775e-06, "loss": 0.0042, "reward": 1.6387500762939453, "reward_std": 0.49872517585754395, "rewards/_accuracy_reward": 0.6387499570846558, "rewards/_format_reward": 1.0, "step": 382 }, { "completion_length": 114.25, "epoch": 0.09575, "grad_norm": 0.794729471206665, "kl": 0.24316060543060303, "learning_rate": 4.7875e-06, "loss": 0.0097, "reward": 1.0625, "reward_std": 0.5403371453285217, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.875, "step": 383 }, { "completion_length": 122.5, "epoch": 0.096, "grad_norm": 1.0003390312194824, "kl": 0.10567362606525421, "learning_rate": 4.800000000000001e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 384 }, { "completion_length": 96.375, "epoch": 0.09625, "grad_norm": 0.9128509163856506, "kl": 0.09176424145698547, "learning_rate": 4.8125e-06, "loss": 0.0037, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 385 }, { "completion_length": 87.0, "epoch": 0.0965, "grad_norm": 0.8376882672309875, "kl": 0.08349757641553879, "learning_rate": 4.825e-06, "loss": 0.0033, "reward": 1.53125, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 1.0, "step": 386 }, { "completion_length": 116.125, "epoch": 0.09675, "grad_norm": 1.173961877822876, "kl": 0.08239471912384033, "learning_rate": 4.837500000000001e-06, "loss": 0.0033, "reward": 1.3624999523162842, "reward_std": 0.404218852519989, "rewards/_accuracy_reward": 0.36249998211860657, "rewards/_format_reward": 1.0, "step": 387 }, { "completion_length": 122.375, "epoch": 0.097, "grad_norm": 0.07301829755306244, "kl": 0.10658504068851471, "learning_rate": 4.85e-06, "loss": 0.0043, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 388 }, { "completion_length": 89.125, "epoch": 0.09725, "grad_norm": 0.2543293237686157, "kl": 0.1679508537054062, "learning_rate": 4.8625000000000005e-06, "loss": 0.0067, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 389 }, { "completion_length": 116.125, "epoch": 0.0975, "grad_norm": 0.9531389474868774, "kl": 0.10527552664279938, "learning_rate": 4.875e-06, "loss": 0.0042, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 390 }, { "completion_length": 118.875, "epoch": 0.09775, "grad_norm": 1.4362037181854248, "kl": 0.2108982801437378, "learning_rate": 4.8875e-06, "loss": 0.0084, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 391 }, { "completion_length": 112.25, "epoch": 0.098, "grad_norm": 0.7744788527488708, "kl": 0.09784118831157684, "learning_rate": 4.9000000000000005e-06, "loss": 0.0039, "reward": 1.662500023841858, "reward_std": 0.4711308181285858, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 0.875, "step": 392 }, { "completion_length": 125.75, "epoch": 0.09825, "grad_norm": 0.11483877152204514, "kl": 0.09153156727552414, "learning_rate": 4.912500000000001e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 393 }, { "completion_length": 119.25, "epoch": 0.0985, "grad_norm": 0.9200050234794617, "kl": 0.12948504090309143, "learning_rate": 4.925e-06, "loss": 0.0052, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 394 }, { "completion_length": 175.375, "epoch": 0.09875, "grad_norm": 0.7395903468132019, "kl": 0.12225595861673355, "learning_rate": 4.937500000000001e-06, "loss": 0.0049, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 395 }, { "completion_length": 122.5, "epoch": 0.099, "grad_norm": 0.6254833936691284, "kl": 0.0814957544207573, "learning_rate": 4.95e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 396 }, { "completion_length": 150.875, "epoch": 0.09925, "grad_norm": 1.205034613609314, "kl": 0.08875016123056412, "learning_rate": 4.9625e-06, "loss": 0.0036, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 397 }, { "completion_length": 161.125, "epoch": 0.0995, "grad_norm": 0.9321796894073486, "kl": 0.16619841754436493, "learning_rate": 4.975000000000001e-06, "loss": 0.0066, "reward": 1.7512500286102295, "reward_std": 0.4606032371520996, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 0.875, "step": 398 }, { "completion_length": 159.25, "epoch": 0.09975, "grad_norm": 0.8920565843582153, "kl": 0.12888400256633759, "learning_rate": 4.987500000000001e-06, "loss": 0.0052, "reward": 1.274999976158142, "reward_std": 0.6974443197250366, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.875, "step": 399 }, { "completion_length": 118.5, "epoch": 0.1, "grad_norm": 0.8786934018135071, "kl": 0.09915917366743088, "learning_rate": 5e-06, "loss": 0.004, "reward": 1.6875, "reward_std": 0.4381372928619385, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 400 }, { "completion_length": 77.625, "epoch": 0.10025, "grad_norm": 1.2590632438659668, "kl": 0.14523717761039734, "learning_rate": 4.999999048070624e-06, "loss": 0.0058, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 401 }, { "completion_length": 63.125, "epoch": 0.1005, "grad_norm": 1.158711314201355, "kl": 0.10946811735630035, "learning_rate": 4.99999619228322e-06, "loss": 0.0044, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 402 }, { "completion_length": 115.375, "epoch": 0.10075, "grad_norm": 1.5086749792099, "kl": 0.7569040656089783, "learning_rate": 4.999991432639962e-06, "loss": 0.0303, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 403 }, { "completion_length": 153.375, "epoch": 0.101, "grad_norm": 0.7452853322029114, "kl": 0.08908268809318542, "learning_rate": 4.999984769144476e-06, "loss": 0.0036, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 1.0, "step": 404 }, { "completion_length": 83.75, "epoch": 0.10125, "grad_norm": 1.3645873069763184, "kl": 0.10320331156253815, "learning_rate": 4.999976201801837e-06, "loss": 0.0041, "reward": 1.125, "reward_std": 0.10350986570119858, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 1.0, "step": 405 }, { "completion_length": 72.25, "epoch": 0.1015, "grad_norm": 1.1549677848815918, "kl": 0.09673086553812027, "learning_rate": 4.999965730618567e-06, "loss": 0.0039, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 406 }, { "completion_length": 96.125, "epoch": 0.10175, "grad_norm": 1.009154200553894, "kl": 0.1394815593957901, "learning_rate": 4.999953355602643e-06, "loss": 0.0056, "reward": 1.399999976158142, "reward_std": 0.4971346855163574, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 0.875, "step": 407 }, { "completion_length": 125.375, "epoch": 0.102, "grad_norm": 0.9860761165618896, "kl": 0.13221774995326996, "learning_rate": 4.999939076763487e-06, "loss": 0.0053, "reward": 1.149999976158142, "reward_std": 0.8220185041427612, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.75, "step": 408 }, { "completion_length": 60.375, "epoch": 0.10225, "grad_norm": 0.053071990609169006, "kl": 0.0779157504439354, "learning_rate": 4.999922894111975e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 409 }, { "completion_length": 89.875, "epoch": 0.1025, "grad_norm": 0.9673873782157898, "kl": 0.08007065951824188, "learning_rate": 4.9999048076604286e-06, "loss": 0.0032, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 410 }, { "completion_length": 113.25, "epoch": 0.10275, "grad_norm": 0.6946626901626587, "kl": 0.09533973038196564, "learning_rate": 4.9998848174226225e-06, "loss": 0.0038, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 411 }, { "completion_length": 96.125, "epoch": 0.103, "grad_norm": 0.05451874062418938, "kl": 0.07452833652496338, "learning_rate": 4.999862923413781e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 412 }, { "completion_length": 72.0, "epoch": 0.10325, "grad_norm": 0.9213165044784546, "kl": 0.1262635439634323, "learning_rate": 4.999839125650576e-06, "loss": 0.0051, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 413 }, { "completion_length": 67.125, "epoch": 0.1035, "grad_norm": 1.0785224437713623, "kl": 0.0754222422838211, "learning_rate": 4.9998134241511305e-06, "loss": 0.003, "reward": 1.3125, "reward_std": 0.4299086928367615, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 1.0, "step": 414 }, { "completion_length": 157.125, "epoch": 0.10375, "grad_norm": 1.1292070150375366, "kl": 0.23137228190898895, "learning_rate": 4.999785818935018e-06, "loss": 0.0093, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 415 }, { "completion_length": 142.25, "epoch": 0.104, "grad_norm": 0.8795533180236816, "kl": 0.10361534357070923, "learning_rate": 4.999756310023261e-06, "loss": 0.0041, "reward": 1.3125, "reward_std": 0.873723566532135, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.75, "step": 416 }, { "completion_length": 116.0, "epoch": 0.10425, "grad_norm": 0.8660876750946045, "kl": 0.06504642218351364, "learning_rate": 4.999724897438332e-06, "loss": 0.0026, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 417 }, { "completion_length": 76.125, "epoch": 0.1045, "grad_norm": 0.8043219447135925, "kl": 0.11721104383468628, "learning_rate": 4.9996915812041515e-06, "loss": 0.0047, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 418 }, { "completion_length": 134.5, "epoch": 0.10475, "grad_norm": 0.8124399185180664, "kl": 0.1422225832939148, "learning_rate": 4.999656361346094e-06, "loss": 0.0057, "reward": 1.2687499523162842, "reward_std": 0.87257319688797, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.75, "step": 419 }, { "completion_length": 113.125, "epoch": 0.105, "grad_norm": 0.2114391177892685, "kl": 0.1519714593887329, "learning_rate": 4.9996192378909785e-06, "loss": 0.0061, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 420 }, { "completion_length": 89.25, "epoch": 0.10525, "grad_norm": 0.5262160301208496, "kl": 0.061721399426460266, "learning_rate": 4.9995802108670775e-06, "loss": 0.0025, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 421 }, { "completion_length": 101.125, "epoch": 0.1055, "grad_norm": 0.956619918346405, "kl": 0.07579920440912247, "learning_rate": 4.999539280304111e-06, "loss": 0.003, "reward": 1.756250023841858, "reward_std": 0.45153507590293884, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 422 }, { "completion_length": 110.875, "epoch": 0.10575, "grad_norm": 1.250651478767395, "kl": 0.1732567399740219, "learning_rate": 4.999496446233249e-06, "loss": 0.0069, "reward": 1.568750023841858, "reward_std": 0.4689939618110657, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 0.875, "step": 423 }, { "completion_length": 121.5, "epoch": 0.106, "grad_norm": 0.8256289958953857, "kl": 0.1623517870903015, "learning_rate": 4.999451708687114e-06, "loss": 0.0065, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 424 }, { "completion_length": 83.125, "epoch": 0.10625, "grad_norm": 0.12795695662498474, "kl": 0.08748316764831543, "learning_rate": 4.999405067699773e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 425 }, { "completion_length": 69.5, "epoch": 0.1065, "grad_norm": 1.4006377458572388, "kl": 0.09922429174184799, "learning_rate": 4.999356523306746e-06, "loss": 0.004, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 1.0, "step": 426 }, { "completion_length": 68.375, "epoch": 0.10675, "grad_norm": 1.326810598373413, "kl": 0.10130297392606735, "learning_rate": 4.999306075545002e-06, "loss": 0.0041, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 427 }, { "completion_length": 106.75, "epoch": 0.107, "grad_norm": 1.5847952365875244, "kl": 0.14548355340957642, "learning_rate": 4.9992537244529585e-06, "loss": 0.0058, "reward": 1.53125, "reward_std": 0.8705242872238159, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 428 }, { "completion_length": 76.75, "epoch": 0.10725, "grad_norm": 1.0535387992858887, "kl": 0.08334135264158249, "learning_rate": 4.999199470070484e-06, "loss": 0.0033, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 429 }, { "completion_length": 105.75, "epoch": 0.1075, "grad_norm": 0.04739993438124657, "kl": 0.05667338892817497, "learning_rate": 4.999143312438893e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 430 }, { "completion_length": 97.375, "epoch": 0.10775, "grad_norm": 0.050007615238428116, "kl": 0.09176231920719147, "learning_rate": 4.9990852516009556e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 431 }, { "completion_length": 68.5, "epoch": 0.108, "grad_norm": 1.226663589477539, "kl": 0.08610677719116211, "learning_rate": 4.999025287600886e-06, "loss": 0.0034, "reward": 1.318750023841858, "reward_std": 0.2840240001678467, "rewards/_accuracy_reward": 0.3187499940395355, "rewards/_format_reward": 1.0, "step": 432 }, { "completion_length": 135.125, "epoch": 0.10825, "grad_norm": 0.9774854183197021, "kl": 0.060582954436540604, "learning_rate": 4.998963420484349e-06, "loss": 0.0024, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 433 }, { "completion_length": 108.25, "epoch": 0.1085, "grad_norm": 0.0652063712477684, "kl": 0.0928327664732933, "learning_rate": 4.9988996502984604e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 434 }, { "completion_length": 116.5, "epoch": 0.10875, "grad_norm": 0.066441610455513, "kl": 0.08606898039579391, "learning_rate": 4.998833977091783e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 435 }, { "completion_length": 76.25, "epoch": 0.109, "grad_norm": 0.06395157426595688, "kl": 0.08062107861042023, "learning_rate": 4.998766400914329e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 436 }, { "completion_length": 112.375, "epoch": 0.10925, "grad_norm": 0.9348214864730835, "kl": 0.11126627773046494, "learning_rate": 4.998696921817562e-06, "loss": 0.0045, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 437 }, { "completion_length": 98.5, "epoch": 0.1095, "grad_norm": 0.9661799669265747, "kl": 0.06792579591274261, "learning_rate": 4.998625539854394e-06, "loss": 0.0027, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 438 }, { "completion_length": 121.625, "epoch": 0.10975, "grad_norm": 0.9441984295845032, "kl": 0.06458202749490738, "learning_rate": 4.998552255079182e-06, "loss": 0.0026, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 1.0, "step": 439 }, { "completion_length": 135.375, "epoch": 0.11, "grad_norm": 1.015769124031067, "kl": 0.05865727737545967, "learning_rate": 4.99847706754774e-06, "loss": 0.0023, "reward": 1.5625, "reward_std": 0.47715675830841064, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.75, "step": 440 }, { "completion_length": 156.25, "epoch": 0.11025, "grad_norm": 0.6271827220916748, "kl": 0.08216940611600876, "learning_rate": 4.998399977317323e-06, "loss": 0.0033, "reward": 1.5012500286102295, "reward_std": 0.42089828848838806, "rewards/_accuracy_reward": 0.5012500286102295, "rewards/_format_reward": 1.0, "step": 441 }, { "completion_length": 143.5, "epoch": 0.1105, "grad_norm": 0.7138703465461731, "kl": 0.09817658364772797, "learning_rate": 4.9983209844466404e-06, "loss": 0.0039, "reward": 1.7512500286102295, "reward_std": 0.7035712003707886, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 0.875, "step": 442 }, { "completion_length": 126.75, "epoch": 0.11075, "grad_norm": 0.5848399996757507, "kl": 0.06327465921640396, "learning_rate": 4.9982400889958494e-06, "loss": 0.0025, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 443 }, { "completion_length": 126.75, "epoch": 0.111, "grad_norm": 1.716501235961914, "kl": 0.430374413728714, "learning_rate": 4.998157291026553e-06, "loss": 0.0172, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.625, "step": 444 }, { "completion_length": 97.375, "epoch": 0.11125, "grad_norm": 1.0113590955734253, "kl": 0.08199731260538101, "learning_rate": 4.998072590601808e-06, "loss": 0.0033, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 445 }, { "completion_length": 163.5, "epoch": 0.1115, "grad_norm": 0.5446489453315735, "kl": 0.08332154899835587, "learning_rate": 4.9979859877861155e-06, "loss": 0.0033, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 446 }, { "completion_length": 105.25, "epoch": 0.11175, "grad_norm": 0.846171498298645, "kl": 0.08622215688228607, "learning_rate": 4.997897482645428e-06, "loss": 0.0034, "reward": 1.4249999523162842, "reward_std": 0.481812059879303, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 0.875, "step": 447 }, { "completion_length": 116.25, "epoch": 0.112, "grad_norm": 0.7110444903373718, "kl": 0.05666430667042732, "learning_rate": 4.997807075247147e-06, "loss": 0.0023, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 448 }, { "completion_length": 92.125, "epoch": 0.11225, "grad_norm": 1.340246558189392, "kl": 0.09173227101564407, "learning_rate": 4.9977147656601196e-06, "loss": 0.0037, "reward": 1.5187499523162842, "reward_std": 0.514738142490387, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 0.875, "step": 449 }, { "completion_length": 92.25, "epoch": 0.1125, "grad_norm": 1.8258260488510132, "kl": 0.14632035791873932, "learning_rate": 4.997620553954645e-06, "loss": 0.0059, "reward": 1.475000023841858, "reward_std": 0.6419835090637207, "rewards/_accuracy_reward": 0.6000000238418579, "rewards/_format_reward": 0.875, "step": 450 }, { "completion_length": 172.625, "epoch": 0.11275, "grad_norm": 0.7036133408546448, "kl": 0.07693489640951157, "learning_rate": 4.997524440202469e-06, "loss": 0.0031, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.625, "step": 451 }, { "completion_length": 168.0, "epoch": 0.113, "grad_norm": 0.6177405118942261, "kl": 0.08349208533763885, "learning_rate": 4.997426424476787e-06, "loss": 0.0033, "reward": 1.2949999570846558, "reward_std": 0.6939947009086609, "rewards/_accuracy_reward": 0.41999998688697815, "rewards/_format_reward": 0.875, "step": 452 }, { "completion_length": 161.25, "epoch": 0.11325, "grad_norm": 0.7319234013557434, "kl": 0.1011638194322586, "learning_rate": 4.997326506852242e-06, "loss": 0.004, "reward": 1.28125, "reward_std": 0.9949650168418884, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.625, "step": 453 }, { "completion_length": 143.25, "epoch": 0.1135, "grad_norm": 0.7980057597160339, "kl": 0.08353295922279358, "learning_rate": 4.9972246874049254e-06, "loss": 0.0033, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 454 }, { "completion_length": 138.125, "epoch": 0.11375, "grad_norm": 0.7856406569480896, "kl": 0.09998480975627899, "learning_rate": 4.9971209662123774e-06, "loss": 0.004, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 455 }, { "completion_length": 143.75, "epoch": 0.114, "grad_norm": 0.48554107546806335, "kl": 0.05186166986823082, "learning_rate": 4.9970153433535855e-06, "loss": 0.0021, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 456 }, { "completion_length": 79.625, "epoch": 0.11425, "grad_norm": 0.03818913921713829, "kl": 0.07487285137176514, "learning_rate": 4.996907818908987e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 457 }, { "completion_length": 117.75, "epoch": 0.1145, "grad_norm": 0.8235962390899658, "kl": 0.0846768468618393, "learning_rate": 4.996798392960466e-06, "loss": 0.0034, "reward": 1.53125, "reward_std": 0.8705242872238159, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 458 }, { "completion_length": 204.0, "epoch": 0.11475, "grad_norm": 0.5379483103752136, "kl": 0.09781916439533234, "learning_rate": 4.996687065591355e-06, "loss": 0.0039, "reward": 1.2937500476837158, "reward_std": 0.6945900321006775, "rewards/_accuracy_reward": 0.543749988079071, "rewards/_format_reward": 0.75, "step": 459 }, { "completion_length": 136.75, "epoch": 0.115, "grad_norm": 0.7759735584259033, "kl": 0.0745742917060852, "learning_rate": 4.9965738368864345e-06, "loss": 0.003, "reward": 1.568750023841858, "reward_std": 0.6589697003364563, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 0.875, "step": 460 }, { "completion_length": 87.0, "epoch": 0.11525, "grad_norm": 1.2910523414611816, "kl": 0.07611493766307831, "learning_rate": 4.996458706931935e-06, "loss": 0.003, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 461 }, { "completion_length": 133.375, "epoch": 0.1155, "grad_norm": 0.7070830464363098, "kl": 0.10267248749732971, "learning_rate": 4.99634167581553e-06, "loss": 0.0041, "reward": 1.6875, "reward_std": 0.6373774409294128, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 462 }, { "completion_length": 107.625, "epoch": 0.11575, "grad_norm": 0.16717670857906342, "kl": 0.13669300079345703, "learning_rate": 4.996222743626346e-06, "loss": 0.0055, "reward": 1.25, "reward_std": 0.0, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 1.0, "step": 463 }, { "completion_length": 74.125, "epoch": 0.116, "grad_norm": 1.4139927625656128, "kl": 0.1574665755033493, "learning_rate": 4.996101910454953e-06, "loss": 0.0063, "reward": 1.34375, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.34375, "rewards/_format_reward": 1.0, "step": 464 }, { "completion_length": 151.375, "epoch": 0.11625, "grad_norm": 0.7854687571525574, "kl": 0.10802503675222397, "learning_rate": 4.995979176393372e-06, "loss": 0.0043, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 465 }, { "completion_length": 136.75, "epoch": 0.1165, "grad_norm": 0.6209039688110352, "kl": 0.1174168810248375, "learning_rate": 4.995854541535072e-06, "loss": 0.0047, "reward": 1.568750023841858, "reward_std": 0.7009878158569336, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 0.875, "step": 466 }, { "completion_length": 147.25, "epoch": 0.11675, "grad_norm": 0.5902235507965088, "kl": 0.10785003006458282, "learning_rate": 4.995728005974964e-06, "loss": 0.0043, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.2812499701976776, "rewards/_format_reward": 0.875, "step": 467 }, { "completion_length": 133.375, "epoch": 0.117, "grad_norm": 0.8462874889373779, "kl": 0.2404460906982422, "learning_rate": 4.995599569809414e-06, "loss": 0.0096, "reward": 1.274999976158142, "reward_std": 0.6974443197250366, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.875, "step": 468 }, { "completion_length": 131.125, "epoch": 0.11725, "grad_norm": 0.5348749756813049, "kl": 0.0859207808971405, "learning_rate": 4.9954692331362295e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 469 }, { "completion_length": 141.25, "epoch": 0.1175, "grad_norm": 0.6675086617469788, "kl": 0.21905027329921722, "learning_rate": 4.995336996054668e-06, "loss": 0.0088, "reward": 1.78125, "reward_std": 0.6187184453010559, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 470 }, { "completion_length": 135.25, "epoch": 0.11775, "grad_norm": 0.5653254985809326, "kl": 0.1933996081352234, "learning_rate": 4.995202858665434e-06, "loss": 0.0077, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 471 }, { "completion_length": 155.875, "epoch": 0.118, "grad_norm": 0.1728929877281189, "kl": 0.16971825063228607, "learning_rate": 4.9950668210706795e-06, "loss": 0.0068, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 472 }, { "completion_length": 130.5, "epoch": 0.11825, "grad_norm": 0.8266690969467163, "kl": 0.0714937224984169, "learning_rate": 4.9949288833740016e-06, "loss": 0.0029, "reward": 1.3312499523162842, "reward_std": 0.671319305896759, "rewards/_accuracy_reward": 0.45624998211860657, "rewards/_format_reward": 0.875, "step": 473 }, { "completion_length": 111.125, "epoch": 0.1185, "grad_norm": 0.8442752957344055, "kl": 0.10157648473978043, "learning_rate": 4.994789045680448e-06, "loss": 0.0041, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 474 }, { "completion_length": 121.0, "epoch": 0.11875, "grad_norm": 0.6787703037261963, "kl": 0.0763428583741188, "learning_rate": 4.994647308096509e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 475 }, { "completion_length": 163.375, "epoch": 0.119, "grad_norm": 0.028817906975746155, "kl": 0.04729737713932991, "learning_rate": 4.994503670730126e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 476 }, { "completion_length": 121.5, "epoch": 0.11925, "grad_norm": 0.6946129202842712, "kl": 0.20381557941436768, "learning_rate": 4.994358133690683e-06, "loss": 0.0082, "reward": 1.3250000476837158, "reward_std": 0.6850443482398987, "rewards/_accuracy_reward": 0.44999998807907104, "rewards/_format_reward": 0.875, "step": 477 }, { "completion_length": 115.5, "epoch": 0.1195, "grad_norm": 16.712684631347656, "kl": 0.10329899191856384, "learning_rate": 4.9942106970890136e-06, "loss": 0.0041, "reward": 1.443750023841858, "reward_std": 0.7123590111732483, "rewards/_accuracy_reward": 0.5687500238418579, "rewards/_format_reward": 0.875, "step": 478 }, { "completion_length": 112.375, "epoch": 0.11975, "grad_norm": 0.08893012255430222, "kl": 0.07471595704555511, "learning_rate": 4.9940613610373974e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 479 }, { "completion_length": 161.625, "epoch": 0.12, "grad_norm": 0.5868445038795471, "kl": 0.08950529247522354, "learning_rate": 4.993910125649561e-06, "loss": 0.0036, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 480 }, { "completion_length": 141.625, "epoch": 0.12025, "grad_norm": 0.5067136883735657, "kl": 0.11062958091497421, "learning_rate": 4.993756991040676e-06, "loss": 0.0044, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 481 }, { "completion_length": 114.125, "epoch": 0.1205, "grad_norm": 0.1355297863483429, "kl": 0.15943318605422974, "learning_rate": 4.993601957327361e-06, "loss": 0.0064, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 482 }, { "completion_length": 163.0, "epoch": 0.12075, "grad_norm": 1.0221439599990845, "kl": 0.14336055517196655, "learning_rate": 4.99344502462768e-06, "loss": 0.0057, "reward": 1.131250023841858, "reward_std": 0.9902876615524292, "rewards/_accuracy_reward": 0.5062500238418579, "rewards/_format_reward": 0.625, "step": 483 }, { "completion_length": 182.375, "epoch": 0.121, "grad_norm": 0.9383127689361572, "kl": 0.12501807510852814, "learning_rate": 4.993286193061145e-06, "loss": 0.005, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.625, "step": 484 }, { "completion_length": 150.25, "epoch": 0.12125, "grad_norm": 1.1651649475097656, "kl": 0.10007007420063019, "learning_rate": 4.993125462748714e-06, "loss": 0.004, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 485 }, { "completion_length": 108.0, "epoch": 0.1215, "grad_norm": 0.028756048530340195, "kl": 0.06177728250622749, "learning_rate": 4.9929628338127904e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 486 }, { "completion_length": 85.0, "epoch": 0.12175, "grad_norm": 0.8098493814468384, "kl": 0.1391671895980835, "learning_rate": 4.9927983063772205e-06, "loss": 0.0056, "reward": 1.318750023841858, "reward_std": 0.28402402997016907, "rewards/_accuracy_reward": 0.3187499940395355, "rewards/_format_reward": 1.0, "step": 487 }, { "completion_length": 120.125, "epoch": 0.122, "grad_norm": 1.6718604564666748, "kl": 0.07330606877803802, "learning_rate": 4.992631880567301e-06, "loss": 0.0029, "reward": 1.537500023841858, "reward_std": 0.7322909235954285, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.875, "step": 488 }, { "completion_length": 136.375, "epoch": 0.12225, "grad_norm": 0.7673369646072388, "kl": 0.10325151681900024, "learning_rate": 4.992463556509772e-06, "loss": 0.0041, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 489 }, { "completion_length": 142.375, "epoch": 0.1225, "grad_norm": 1.1908446550369263, "kl": 0.1146053820848465, "learning_rate": 4.992293334332821e-06, "loss": 0.0046, "reward": 1.3875000476837158, "reward_std": 0.3879893124103546, "rewards/_accuracy_reward": 0.38749998807907104, "rewards/_format_reward": 1.0, "step": 490 }, { "completion_length": 78.625, "epoch": 0.12275, "grad_norm": 1.0218267440795898, "kl": 0.20096728205680847, "learning_rate": 4.992121214166077e-06, "loss": 0.008, "reward": 1.5499999523162842, "reward_std": 0.4855042099952698, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 491 }, { "completion_length": 115.125, "epoch": 0.123, "grad_norm": 0.7811625003814697, "kl": 0.07716540992259979, "learning_rate": 4.991947196140619e-06, "loss": 0.0031, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 492 }, { "completion_length": 176.375, "epoch": 0.12325, "grad_norm": 0.835605800151825, "kl": 0.18739749491214752, "learning_rate": 4.991771280388967e-06, "loss": 0.0075, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.875, "step": 493 }, { "completion_length": 117.625, "epoch": 0.1235, "grad_norm": 0.06079603359103203, "kl": 0.06408681720495224, "learning_rate": 4.991593467045092e-06, "loss": 0.0026, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 494 }, { "completion_length": 96.5, "epoch": 0.12375, "grad_norm": 0.0451425276696682, "kl": 0.05102433264255524, "learning_rate": 4.991413756244404e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 495 }, { "completion_length": 109.5, "epoch": 0.124, "grad_norm": 0.8194900751113892, "kl": 0.12500625848770142, "learning_rate": 4.9912321481237616e-06, "loss": 0.005, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 496 }, { "completion_length": 106.875, "epoch": 0.12425, "grad_norm": 0.21397340297698975, "kl": 0.1683153510093689, "learning_rate": 4.991048642821466e-06, "loss": 0.0067, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 497 }, { "completion_length": 122.375, "epoch": 0.1245, "grad_norm": 1.4960228204727173, "kl": 0.11880878359079361, "learning_rate": 4.990863240477266e-06, "loss": 0.0048, "reward": 1.5187499523162842, "reward_std": 0.514738142490387, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 0.875, "step": 498 }, { "completion_length": 150.125, "epoch": 0.12475, "grad_norm": 0.8118491768836975, "kl": 0.15849874913692474, "learning_rate": 4.990675941232353e-06, "loss": 0.0063, "reward": 0.9437499046325684, "reward_std": 0.38770151138305664, "rewards/_accuracy_reward": 0.06875000149011612, "rewards/_format_reward": 0.875, "step": 499 }, { "completion_length": 136.75, "epoch": 0.125, "grad_norm": 0.6899111866950989, "kl": 0.1458391398191452, "learning_rate": 4.990486745229364e-06, "loss": 0.0058, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 500 }, { "completion_length": 160.75, "epoch": 0.12525, "grad_norm": 0.6339772939682007, "kl": 0.16045698523521423, "learning_rate": 4.990295652612379e-06, "loss": 0.0064, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 501 }, { "completion_length": 113.375, "epoch": 0.1255, "grad_norm": 0.7603309154510498, "kl": 0.10802485048770905, "learning_rate": 4.990102663526925e-06, "loss": 0.0043, "reward": 1.21875, "reward_std": 0.4712729752063751, "rewards/_accuracy_reward": 0.34375, "rewards/_format_reward": 0.875, "step": 502 }, { "completion_length": 121.0, "epoch": 0.12575, "grad_norm": 0.5087311267852783, "kl": 0.08297999203205109, "learning_rate": 4.989907778119969e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 503 }, { "completion_length": 164.375, "epoch": 0.126, "grad_norm": 0.7062379121780396, "kl": 0.16635379195213318, "learning_rate": 4.989710996539926e-06, "loss": 0.0067, "reward": 1.40625, "reward_std": 0.7311622500419617, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.625, "step": 504 }, { "completion_length": 126.25, "epoch": 0.12625, "grad_norm": 0.9284823536872864, "kl": 0.07494886219501495, "learning_rate": 4.989512318936654e-06, "loss": 0.003, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 505 }, { "completion_length": 161.375, "epoch": 0.1265, "grad_norm": 0.6040777564048767, "kl": 0.14252394437789917, "learning_rate": 4.989311745461456e-06, "loss": 0.0057, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 506 }, { "completion_length": 130.25, "epoch": 0.12675, "grad_norm": 0.06497879326343536, "kl": 0.06290639191865921, "learning_rate": 4.989109276267074e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 507 }, { "completion_length": 135.875, "epoch": 0.127, "grad_norm": 0.672325074672699, "kl": 0.05711538344621658, "learning_rate": 4.9889049115077e-06, "loss": 0.0023, "reward": 1.46875, "reward_std": 0.6999680995941162, "rewards/_accuracy_reward": 0.59375, "rewards/_format_reward": 0.875, "step": 508 }, { "completion_length": 184.875, "epoch": 0.12725, "grad_norm": 0.994357705116272, "kl": 0.13488022983074188, "learning_rate": 4.988698651338965e-06, "loss": 0.0054, "reward": 1.125, "reward_std": 0.9910312294960022, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.5, "step": 509 }, { "completion_length": 155.75, "epoch": 0.1275, "grad_norm": 0.030594639480113983, "kl": 0.06842894107103348, "learning_rate": 4.988490495917948e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 510 }, { "completion_length": 163.25, "epoch": 0.12775, "grad_norm": 0.887022852897644, "kl": 0.19011199474334717, "learning_rate": 4.988280445403164e-06, "loss": 0.0076, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 511 }, { "completion_length": 150.5, "epoch": 0.128, "grad_norm": 0.8398552536964417, "kl": 0.154772087931633, "learning_rate": 4.988068499954578e-06, "loss": 0.0062, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 512 }, { "completion_length": 143.0, "epoch": 0.12825, "grad_norm": 0.8649221062660217, "kl": 0.07180348038673401, "learning_rate": 4.987854659733597e-06, "loss": 0.0029, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 513 }, { "completion_length": 183.75, "epoch": 0.1285, "grad_norm": 0.36669495701789856, "kl": 0.0827406495809555, "learning_rate": 4.987638924903066e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 514 }, { "completion_length": 133.875, "epoch": 0.12875, "grad_norm": 0.7541155219078064, "kl": 0.09912529587745667, "learning_rate": 4.987421295627279e-06, "loss": 0.004, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 515 }, { "completion_length": 126.125, "epoch": 0.129, "grad_norm": 1.4548908472061157, "kl": 0.16541288793087006, "learning_rate": 4.987201772071971e-06, "loss": 0.0066, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 516 }, { "completion_length": 124.375, "epoch": 0.12925, "grad_norm": 0.8262905478477478, "kl": 0.21291877329349518, "learning_rate": 4.986980354404316e-06, "loss": 0.0085, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.2812499701976776, "rewards/_format_reward": 0.875, "step": 517 }, { "completion_length": 94.625, "epoch": 0.1295, "grad_norm": 1.0423915386199951, "kl": 0.10086517781019211, "learning_rate": 4.9867570427929356e-06, "loss": 0.004, "reward": 1.693750023841858, "reward_std": 0.42714792490005493, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 518 }, { "completion_length": 137.125, "epoch": 0.12975, "grad_norm": 0.5806828141212463, "kl": 0.1275208294391632, "learning_rate": 4.986531837407891e-06, "loss": 0.0051, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 519 }, { "completion_length": 115.5, "epoch": 0.13, "grad_norm": 1.0486258268356323, "kl": 0.2358919382095337, "learning_rate": 4.986304738420684e-06, "loss": 0.0094, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.875, "step": 520 }, { "completion_length": 102.75, "epoch": 0.13025, "grad_norm": 0.8502905368804932, "kl": 0.11098451167345047, "learning_rate": 4.986075746004262e-06, "loss": 0.0044, "reward": 1.53125, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 1.0, "step": 521 }, { "completion_length": 123.0, "epoch": 0.1305, "grad_norm": 0.04550248757004738, "kl": 0.10682176798582077, "learning_rate": 4.985844860333012e-06, "loss": 0.0043, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 522 }, { "completion_length": 158.0, "epoch": 0.13075, "grad_norm": 0.6766717433929443, "kl": 0.04881787300109863, "learning_rate": 4.985612081582763e-06, "loss": 0.002, "reward": 0.9187499284744263, "reward_std": 0.3712310194969177, "rewards/_accuracy_reward": 0.04375000298023224, "rewards/_format_reward": 0.875, "step": 523 }, { "completion_length": 189.25, "epoch": 0.131, "grad_norm": 0.7428447604179382, "kl": 0.1956254243850708, "learning_rate": 4.985377409930789e-06, "loss": 0.0078, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 524 }, { "completion_length": 97.25, "epoch": 0.13125, "grad_norm": 0.7122312784194946, "kl": 0.08156180381774902, "learning_rate": 4.985140845555799e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 525 }, { "completion_length": 187.75, "epoch": 0.1315, "grad_norm": 0.7845567464828491, "kl": 0.1910717934370041, "learning_rate": 4.98490238863795e-06, "loss": 0.0076, "reward": 1.0325000286102295, "reward_std": 0.9299885034561157, "rewards/_accuracy_reward": 0.4074999988079071, "rewards/_format_reward": 0.625, "step": 526 }, { "completion_length": 183.0, "epoch": 0.13175, "grad_norm": 0.7677600979804993, "kl": 0.17733940482139587, "learning_rate": 4.984662039358835e-06, "loss": 0.0071, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 527 }, { "completion_length": 142.375, "epoch": 0.132, "grad_norm": 0.6115749478340149, "kl": 0.21568211913108826, "learning_rate": 4.984419797901491e-06, "loss": 0.0086, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 528 }, { "completion_length": 104.75, "epoch": 0.13225, "grad_norm": 0.6236662268638611, "kl": 0.09865246713161469, "learning_rate": 4.9841756644503965e-06, "loss": 0.0039, "reward": 1.0374999046325684, "reward_std": 0.5350233316421509, "rewards/_accuracy_reward": 0.16250000894069672, "rewards/_format_reward": 0.875, "step": 529 }, { "completion_length": 85.75, "epoch": 0.1325, "grad_norm": 1.077025294303894, "kl": 0.11589670181274414, "learning_rate": 4.9839296391914696e-06, "loss": 0.0046, "reward": 1.190000057220459, "reward_std": 0.11109840869903564, "rewards/_accuracy_reward": 0.1899999976158142, "rewards/_format_reward": 1.0, "step": 530 }, { "completion_length": 125.125, "epoch": 0.13275, "grad_norm": 0.9445363283157349, "kl": 0.12175693362951279, "learning_rate": 4.983681722312068e-06, "loss": 0.0049, "reward": 1.3250000476837158, "reward_std": 0.6850443482398987, "rewards/_accuracy_reward": 0.44999998807907104, "rewards/_format_reward": 0.875, "step": 531 }, { "completion_length": 148.875, "epoch": 0.133, "grad_norm": 0.7169702053070068, "kl": 0.0948939323425293, "learning_rate": 4.983431914000991e-06, "loss": 0.0038, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 532 }, { "completion_length": 131.625, "epoch": 0.13325, "grad_norm": 0.7590733766555786, "kl": 0.08492686599493027, "learning_rate": 4.983180214448481e-06, "loss": 0.0034, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 533 }, { "completion_length": 120.375, "epoch": 0.1335, "grad_norm": 0.895210325717926, "kl": 0.12446754425764084, "learning_rate": 4.982926623846216e-06, "loss": 0.005, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 534 }, { "completion_length": 85.25, "epoch": 0.13375, "grad_norm": 1.56959068775177, "kl": 0.09629300981760025, "learning_rate": 4.982671142387316e-06, "loss": 0.0039, "reward": 1.34375, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.34375, "rewards/_format_reward": 1.0, "step": 535 }, { "completion_length": 105.75, "epoch": 0.134, "grad_norm": 1.4607653617858887, "kl": 0.20348867774009705, "learning_rate": 4.9824137702663424e-06, "loss": 0.0081, "reward": 1.5950000286102295, "reward_std": 0.4400324523448944, "rewards/_accuracy_reward": 0.5950000286102295, "rewards/_format_reward": 1.0, "step": 536 }, { "completion_length": 100.375, "epoch": 0.13425, "grad_norm": 0.8848427534103394, "kl": 0.21129947900772095, "learning_rate": 4.982154507679296e-06, "loss": 0.0085, "reward": 1.2062499523162842, "reward_std": 0.6315725445747375, "rewards/_accuracy_reward": 0.33125001192092896, "rewards/_format_reward": 0.875, "step": 537 }, { "completion_length": 166.875, "epoch": 0.1345, "grad_norm": 1.3390650749206543, "kl": 0.2360527515411377, "learning_rate": 4.981893354823614e-06, "loss": 0.0094, "reward": 1.193750023841858, "reward_std": 0.8304204940795898, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.75, "step": 538 }, { "completion_length": 165.5, "epoch": 0.13475, "grad_norm": 0.8016129732131958, "kl": 0.16088564693927765, "learning_rate": 4.981630311898178e-06, "loss": 0.0064, "reward": 1.149999976158142, "reward_std": 0.6358346939086914, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.75, "step": 539 }, { "completion_length": 131.625, "epoch": 0.135, "grad_norm": 1.0546795129776, "kl": 0.13393646478652954, "learning_rate": 4.981365379103306e-06, "loss": 0.0054, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 540 }, { "completion_length": 82.0, "epoch": 0.13525, "grad_norm": 0.7583869695663452, "kl": 0.15737389028072357, "learning_rate": 4.981098556640754e-06, "loss": 0.0063, "reward": 1.0749999284744263, "reward_std": 0.0707106813788414, "rewards/_accuracy_reward": 0.07500000298023224, "rewards/_format_reward": 1.0, "step": 541 }, { "completion_length": 77.375, "epoch": 0.1355, "grad_norm": 1.210227370262146, "kl": 0.14557315409183502, "learning_rate": 4.980829844713722e-06, "loss": 0.0058, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 542 }, { "completion_length": 117.125, "epoch": 0.13575, "grad_norm": 0.8977431654930115, "kl": 0.10112278908491135, "learning_rate": 4.980559243526844e-06, "loss": 0.004, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 543 }, { "completion_length": 110.375, "epoch": 0.136, "grad_norm": 0.9589873552322388, "kl": 0.23000037670135498, "learning_rate": 4.980286753286196e-06, "loss": 0.0092, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 544 }, { "completion_length": 136.5, "epoch": 0.13625, "grad_norm": 1.4247018098831177, "kl": 0.224748432636261, "learning_rate": 4.980012374199288e-06, "loss": 0.009, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 545 }, { "completion_length": 109.0, "epoch": 0.1365, "grad_norm": 0.8289728760719299, "kl": 0.1731492429971695, "learning_rate": 4.979736106475075e-06, "loss": 0.0069, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 1.0, "step": 546 }, { "completion_length": 161.375, "epoch": 0.13675, "grad_norm": 0.7212539911270142, "kl": 0.1648021787405014, "learning_rate": 4.979457950323945e-06, "loss": 0.0066, "reward": 1.28125, "reward_std": 0.44395744800567627, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 0.875, "step": 547 }, { "completion_length": 133.25, "epoch": 0.137, "grad_norm": 2.3475310802459717, "kl": 0.3754054307937622, "learning_rate": 4.979177905957726e-06, "loss": 0.015, "reward": 0.78125, "reward_std": 0.7100993394851685, "rewards/_accuracy_reward": 0.1562499850988388, "rewards/_format_reward": 0.625, "step": 548 }, { "completion_length": 162.25, "epoch": 0.13725, "grad_norm": 0.7575663328170776, "kl": 0.1953345090150833, "learning_rate": 4.978895973589686e-06, "loss": 0.0078, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 549 }, { "completion_length": 175.0, "epoch": 0.1375, "grad_norm": 2.3825697898864746, "kl": 0.20287609100341797, "learning_rate": 4.978612153434527e-06, "loss": 0.0081, "reward": 1.1687500476837158, "reward_std": 0.8314949870109558, "rewards/_accuracy_reward": 0.41874998807907104, "rewards/_format_reward": 0.75, "step": 550 }, { "completion_length": 82.75, "epoch": 0.13775, "grad_norm": 1.088090419769287, "kl": 0.24210913479328156, "learning_rate": 4.97832644570839e-06, "loss": 0.0097, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 551 }, { "completion_length": 182.625, "epoch": 0.138, "grad_norm": 0.5995256304740906, "kl": 0.1471828818321228, "learning_rate": 4.978038850628855e-06, "loss": 0.0059, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 552 }, { "completion_length": 112.5, "epoch": 0.13825, "grad_norm": 0.835372805595398, "kl": 0.12362098693847656, "learning_rate": 4.977749368414938e-06, "loss": 0.0049, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 553 }, { "completion_length": 102.625, "epoch": 0.1385, "grad_norm": 0.08654724806547165, "kl": 0.16423742473125458, "learning_rate": 4.977457999287091e-06, "loss": 0.0066, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 554 }, { "completion_length": 175.125, "epoch": 0.13875, "grad_norm": 0.7749510407447815, "kl": 0.2203240692615509, "learning_rate": 4.977164743467206e-06, "loss": 0.0088, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.625, "step": 555 }, { "completion_length": 117.625, "epoch": 0.139, "grad_norm": 1.410696268081665, "kl": 0.2296142429113388, "learning_rate": 4.9768696011786095e-06, "loss": 0.0092, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 556 }, { "completion_length": 157.625, "epoch": 0.13925, "grad_norm": 0.8265271186828613, "kl": 0.2630419433116913, "learning_rate": 4.976572572646064e-06, "loss": 0.0105, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 557 }, { "completion_length": 122.875, "epoch": 0.1395, "grad_norm": 0.7778673768043518, "kl": 0.22738924622535706, "learning_rate": 4.976273658095772e-06, "loss": 0.0091, "reward": 1.506250023841858, "reward_std": 0.7513975501060486, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.75, "step": 558 }, { "completion_length": 88.5, "epoch": 0.13975, "grad_norm": 0.042239658534526825, "kl": 0.07109732180833817, "learning_rate": 4.975972857755369e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 559 }, { "completion_length": 87.125, "epoch": 0.14, "grad_norm": 1.4101208448410034, "kl": 0.22331476211547852, "learning_rate": 4.975670171853926e-06, "loss": 0.0089, "reward": 1.78125, "reward_std": 0.6187184453010559, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 560 }, { "completion_length": 88.625, "epoch": 0.14025, "grad_norm": 0.799018383026123, "kl": 0.09630677103996277, "learning_rate": 4.975365600621953e-06, "loss": 0.0039, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 561 }, { "completion_length": 185.0, "epoch": 0.1405, "grad_norm": 0.6921383738517761, "kl": 0.14386911690235138, "learning_rate": 4.975059144291395e-06, "loss": 0.0058, "reward": 1.1875, "reward_std": 0.5724321603775024, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 0.875, "step": 562 }, { "completion_length": 141.375, "epoch": 0.14075, "grad_norm": 0.7761791944503784, "kl": 0.1153935045003891, "learning_rate": 4.974750803095629e-06, "loss": 0.0046, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 563 }, { "completion_length": 120.25, "epoch": 0.141, "grad_norm": 0.2582365572452545, "kl": 0.17987042665481567, "learning_rate": 4.974440577269473e-06, "loss": 0.0072, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 564 }, { "completion_length": 142.0, "epoch": 0.14125, "grad_norm": 1.3962867259979248, "kl": 0.17358022928237915, "learning_rate": 4.974128467049177e-06, "loss": 0.0069, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 565 }, { "completion_length": 140.5, "epoch": 0.1415, "grad_norm": 0.7365003824234009, "kl": 0.09812232851982117, "learning_rate": 4.973814472672424e-06, "loss": 0.0039, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 566 }, { "completion_length": 137.375, "epoch": 0.14175, "grad_norm": 0.8879725337028503, "kl": 0.2860892415046692, "learning_rate": 4.973498594378338e-06, "loss": 0.0114, "reward": 1.399999976158142, "reward_std": 0.4971346855163574, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 0.875, "step": 567 }, { "completion_length": 99.125, "epoch": 0.142, "grad_norm": 0.7891566157341003, "kl": 0.12184718996286392, "learning_rate": 4.973180832407471e-06, "loss": 0.0049, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 568 }, { "completion_length": 81.25, "epoch": 0.14225, "grad_norm": 0.03716867417097092, "kl": 0.054770611226558685, "learning_rate": 4.972861187001815e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 569 }, { "completion_length": 128.125, "epoch": 0.1425, "grad_norm": 1.0409587621688843, "kl": 0.14994631707668304, "learning_rate": 4.972539658404793e-06, "loss": 0.006, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 570 }, { "completion_length": 149.125, "epoch": 0.14275, "grad_norm": 0.9049003720283508, "kl": 0.0754564180970192, "learning_rate": 4.9722162468612625e-06, "loss": 0.003, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 571 }, { "completion_length": 171.25, "epoch": 0.143, "grad_norm": 0.8585514426231384, "kl": 0.0837872251868248, "learning_rate": 4.971890952617515e-06, "loss": 0.0034, "reward": 1.3762500286102295, "reward_std": 0.915547251701355, "rewards/_accuracy_reward": 0.6262500286102295, "rewards/_format_reward": 0.75, "step": 572 }, { "completion_length": 81.625, "epoch": 0.14325, "grad_norm": 1.177064061164856, "kl": 0.10382416099309921, "learning_rate": 4.9715637759212775e-06, "loss": 0.0042, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 573 }, { "completion_length": 143.875, "epoch": 0.1435, "grad_norm": 1.1178721189498901, "kl": 0.17203757166862488, "learning_rate": 4.971234717021709e-06, "loss": 0.0069, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 574 }, { "completion_length": 149.625, "epoch": 0.14375, "grad_norm": 0.05728991702198982, "kl": 0.07802347093820572, "learning_rate": 4.970903776169403e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 575 }, { "completion_length": 104.125, "epoch": 0.144, "grad_norm": 0.9672228097915649, "kl": 0.12484028190374374, "learning_rate": 4.970570953616383e-06, "loss": 0.005, "reward": 1.4562499523162842, "reward_std": 0.45781898498535156, "rewards/_accuracy_reward": 0.45624998211860657, "rewards/_format_reward": 1.0, "step": 576 }, { "completion_length": 136.625, "epoch": 0.14425, "grad_norm": 1.2664356231689453, "kl": 0.21360184252262115, "learning_rate": 4.970236249616109e-06, "loss": 0.0085, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 577 }, { "completion_length": 144.75, "epoch": 0.1445, "grad_norm": 1.266003966331482, "kl": 0.10478426516056061, "learning_rate": 4.969899664423473e-06, "loss": 0.0042, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 578 }, { "completion_length": 145.625, "epoch": 0.14475, "grad_norm": 0.06886684149503708, "kl": 0.08477036654949188, "learning_rate": 4.9695611982947995e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 579 }, { "completion_length": 109.0, "epoch": 0.145, "grad_norm": 0.047261860221624374, "kl": 0.08284247666597366, "learning_rate": 4.9692208514878445e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 580 }, { "completion_length": 134.875, "epoch": 0.14525, "grad_norm": 1.4606877565383911, "kl": 0.4996589720249176, "learning_rate": 4.968878624261798e-06, "loss": 0.02, "reward": 1.7512500286102295, "reward_std": 0.460603266954422, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 0.875, "step": 581 }, { "completion_length": 89.0, "epoch": 0.1455, "grad_norm": 1.950114369392395, "kl": 0.1971653401851654, "learning_rate": 4.968534516877279e-06, "loss": 0.0079, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 582 }, { "completion_length": 84.625, "epoch": 0.14575, "grad_norm": 2.8406033515930176, "kl": 0.4174186885356903, "learning_rate": 4.968188529596342e-06, "loss": 0.0167, "reward": 1.34375, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.34375, "rewards/_format_reward": 1.0, "step": 583 }, { "completion_length": 147.75, "epoch": 0.146, "grad_norm": 0.6163609623908997, "kl": 0.10596763342618942, "learning_rate": 4.96784066268247e-06, "loss": 0.0042, "reward": 1.1937499046325684, "reward_std": 0.33320683240890503, "rewards/_accuracy_reward": 0.19374999403953552, "rewards/_format_reward": 1.0, "step": 584 }, { "completion_length": 151.875, "epoch": 0.14625, "grad_norm": 0.24944590032100677, "kl": 0.09916547685861588, "learning_rate": 4.9674909164005805e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 585 }, { "completion_length": 97.25, "epoch": 0.1465, "grad_norm": 0.6661841869354248, "kl": 0.05971316248178482, "learning_rate": 4.967139291017018e-06, "loss": 0.0024, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 586 }, { "completion_length": 177.75, "epoch": 0.14675, "grad_norm": 1.3881531953811646, "kl": 0.18598943948745728, "learning_rate": 4.966785786799564e-06, "loss": 0.0074, "reward": 1.6375000476837158, "reward_std": 0.5005354285240173, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 0.875, "step": 587 }, { "completion_length": 99.375, "epoch": 0.147, "grad_norm": 0.04929376021027565, "kl": 0.08404329419136047, "learning_rate": 4.966430404017424e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 588 }, { "completion_length": 128.125, "epoch": 0.14725, "grad_norm": 0.5995571613311768, "kl": 0.07869472354650497, "learning_rate": 4.966073142941239e-06, "loss": 0.0031, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 589 }, { "completion_length": 123.875, "epoch": 0.1475, "grad_norm": 0.7187158465385437, "kl": 0.11943908035755157, "learning_rate": 4.965714003843079e-06, "loss": 0.0048, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 590 }, { "completion_length": 142.875, "epoch": 0.14775, "grad_norm": 0.6760448217391968, "kl": 0.06530044227838516, "learning_rate": 4.965352986996443e-06, "loss": 0.0026, "reward": 1.65625, "reward_std": 0.48065245151519775, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.75, "step": 591 }, { "completion_length": 110.125, "epoch": 0.148, "grad_norm": 0.5969019532203674, "kl": 0.11454600095748901, "learning_rate": 4.964990092676263e-06, "loss": 0.0046, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 592 }, { "completion_length": 156.25, "epoch": 0.14825, "grad_norm": 0.6029123663902283, "kl": 0.06257858872413635, "learning_rate": 4.964625321158897e-06, "loss": 0.0025, "reward": 1.5, "reward_std": 0.4225771427154541, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.875, "step": 593 }, { "completion_length": 165.875, "epoch": 0.1485, "grad_norm": 0.6441696882247925, "kl": 0.0646340623497963, "learning_rate": 4.964258672722135e-06, "loss": 0.0026, "reward": 1.3825000524520874, "reward_std": 0.7398986220359802, "rewards/_accuracy_reward": 0.6325000524520874, "rewards/_format_reward": 0.75, "step": 594 }, { "completion_length": 129.125, "epoch": 0.14875, "grad_norm": 0.026783820241689682, "kl": 0.057939767837524414, "learning_rate": 4.963890147645195e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 595 }, { "completion_length": 108.0, "epoch": 0.149, "grad_norm": 1.9925416707992554, "kl": 0.1760830134153366, "learning_rate": 4.963519746208726e-06, "loss": 0.007, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 596 }, { "completion_length": 166.25, "epoch": 0.14925, "grad_norm": 1.065826416015625, "kl": 0.05945620313286781, "learning_rate": 4.963147468694804e-06, "loss": 0.0024, "reward": 0.9437500238418579, "reward_std": 0.8537470102310181, "rewards/_accuracy_reward": 0.3187499940395355, "rewards/_format_reward": 0.625, "step": 597 }, { "completion_length": 154.25, "epoch": 0.1495, "grad_norm": 0.8419215679168701, "kl": 0.270252525806427, "learning_rate": 4.962773315386935e-06, "loss": 0.0108, "reward": 1.506250023841858, "reward_std": 0.7513974905014038, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.75, "step": 598 }, { "completion_length": 103.625, "epoch": 0.14975, "grad_norm": 0.9585555195808411, "kl": 0.12021496146917343, "learning_rate": 4.962397286570053e-06, "loss": 0.0048, "reward": 1.506250023841858, "reward_std": 0.4144165813922882, "rewards/_accuracy_reward": 0.5062500238418579, "rewards/_format_reward": 1.0, "step": 599 }, { "completion_length": 206.625, "epoch": 0.15, "grad_norm": 0.8529797792434692, "kl": 0.1137736588716507, "learning_rate": 4.962019382530521e-06, "loss": 0.0046, "reward": 0.7749999761581421, "reward_std": 0.7176349759101868, "rewards/_accuracy_reward": 0.14999999105930328, "rewards/_format_reward": 0.625, "step": 600 }, { "completion_length": 134.0, "epoch": 0.15025, "grad_norm": 0.17384187877178192, "kl": 0.08758542686700821, "learning_rate": 4.961639603556128e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 601 }, { "completion_length": 135.75, "epoch": 0.1505, "grad_norm": 1.2760076522827148, "kl": 0.09961305558681488, "learning_rate": 4.961257949936092e-06, "loss": 0.004, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 602 }, { "completion_length": 163.75, "epoch": 0.15075, "grad_norm": 0.5337422490119934, "kl": 0.06328251212835312, "learning_rate": 4.96087442196106e-06, "loss": 0.0025, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 603 }, { "completion_length": 141.375, "epoch": 0.151, "grad_norm": 1.2539782524108887, "kl": 0.12791743874549866, "learning_rate": 4.960489019923105e-06, "loss": 0.0051, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 604 }, { "completion_length": 172.875, "epoch": 0.15125, "grad_norm": 0.6172487139701843, "kl": 0.07650119066238403, "learning_rate": 4.960101744115727e-06, "loss": 0.0031, "reward": 1.381250023841858, "reward_std": 0.9133679866790771, "rewards/_accuracy_reward": 0.6312500238418579, "rewards/_format_reward": 0.75, "step": 605 }, { "completion_length": 178.0, "epoch": 0.1515, "grad_norm": 0.5221604108810425, "kl": 0.04332815483212471, "learning_rate": 4.959712594833855e-06, "loss": 0.0017, "reward": 1.5750000476837158, "reward_std": 0.4605897068977356, "rewards/_accuracy_reward": 0.574999988079071, "rewards/_format_reward": 1.0, "step": 606 }, { "completion_length": 192.25, "epoch": 0.15175, "grad_norm": 0.47417011857032776, "kl": 0.05454748496413231, "learning_rate": 4.9593215723738405e-06, "loss": 0.0022, "reward": 1.0749999284744263, "reward_std": 0.0707106813788414, "rewards/_accuracy_reward": 0.07500000298023224, "rewards/_format_reward": 1.0, "step": 607 }, { "completion_length": 151.75, "epoch": 0.152, "grad_norm": 0.7074270844459534, "kl": 0.06145765259861946, "learning_rate": 4.958928677033465e-06, "loss": 0.0025, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 608 }, { "completion_length": 92.625, "epoch": 0.15225, "grad_norm": 1.0133639574050903, "kl": 0.0878884345293045, "learning_rate": 4.958533909111936e-06, "loss": 0.0035, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 609 }, { "completion_length": 140.75, "epoch": 0.1525, "grad_norm": 0.5950965881347656, "kl": 0.050494369119405746, "learning_rate": 4.958137268909887e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 610 }, { "completion_length": 121.75, "epoch": 0.15275, "grad_norm": 0.8706890344619751, "kl": 0.07878470420837402, "learning_rate": 4.957738756729375e-06, "loss": 0.0032, "reward": 1.431249976158142, "reward_std": 0.47579821944236755, "rewards/_accuracy_reward": 0.4312500059604645, "rewards/_format_reward": 1.0, "step": 611 }, { "completion_length": 181.25, "epoch": 0.153, "grad_norm": 0.4814571738243103, "kl": 0.05293947085738182, "learning_rate": 4.957338372873886e-06, "loss": 0.0021, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 612 }, { "completion_length": 148.0, "epoch": 0.15325, "grad_norm": 0.7898260354995728, "kl": 0.11220979690551758, "learning_rate": 4.956936117648329e-06, "loss": 0.0045, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 613 }, { "completion_length": 96.875, "epoch": 0.1535, "grad_norm": 0.7721397876739502, "kl": 0.09208115190267563, "learning_rate": 4.956531991359038e-06, "loss": 0.0037, "reward": 1.59375, "reward_std": 0.4419417381286621, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 0.875, "step": 614 }, { "completion_length": 99.625, "epoch": 0.15375, "grad_norm": 0.953384518623352, "kl": 0.07794458419084549, "learning_rate": 4.956125994313775e-06, "loss": 0.0031, "reward": 1.53125, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 1.0, "step": 615 }, { "completion_length": 140.25, "epoch": 0.154, "grad_norm": 0.5639375448226929, "kl": 0.09136466681957245, "learning_rate": 4.9557181268217225e-06, "loss": 0.0037, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 616 }, { "completion_length": 159.125, "epoch": 0.15425, "grad_norm": 0.9821755290031433, "kl": 0.08807244896888733, "learning_rate": 4.955308389193489e-06, "loss": 0.0035, "reward": 1.431249976158142, "reward_std": 0.47579824924468994, "rewards/_accuracy_reward": 0.4312499761581421, "rewards/_format_reward": 1.0, "step": 617 }, { "completion_length": 127.875, "epoch": 0.1545, "grad_norm": 0.900895357131958, "kl": 0.08849354833364487, "learning_rate": 4.95489678174111e-06, "loss": 0.0035, "reward": 1.6875, "reward_std": 0.4381372928619385, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 618 }, { "completion_length": 122.875, "epoch": 0.15475, "grad_norm": 0.10602176189422607, "kl": 0.11338386684656143, "learning_rate": 4.95448330477804e-06, "loss": 0.0045, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 619 }, { "completion_length": 157.625, "epoch": 0.155, "grad_norm": 0.620640218257904, "kl": 0.0592242069542408, "learning_rate": 4.9540679586191605e-06, "loss": 0.0024, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.668749988079071, "rewards/_format_reward": 1.0, "step": 620 }, { "completion_length": 166.0, "epoch": 0.15525, "grad_norm": 0.20956626534461975, "kl": 0.13530687987804413, "learning_rate": 4.953650743580776e-06, "loss": 0.0054, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 621 }, { "completion_length": 140.5, "epoch": 0.1555, "grad_norm": 0.506284236907959, "kl": 0.07134924829006195, "learning_rate": 4.953231659980613e-06, "loss": 0.0029, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 622 }, { "completion_length": 155.125, "epoch": 0.15575, "grad_norm": 1.0659793615341187, "kl": 0.09406277537345886, "learning_rate": 4.952810708137824e-06, "loss": 0.0038, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 623 }, { "completion_length": 134.625, "epoch": 0.156, "grad_norm": 0.7838442921638489, "kl": 0.0715455636382103, "learning_rate": 4.9523878883729794e-06, "loss": 0.0029, "reward": 1.18874990940094, "reward_std": 0.3359607756137848, "rewards/_accuracy_reward": 0.1887499988079071, "rewards/_format_reward": 1.0, "step": 624 }, { "completion_length": 111.0, "epoch": 0.15625, "grad_norm": 0.03516482189297676, "kl": 0.05203777924180031, "learning_rate": 4.9519632010080765e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 625 }, { "completion_length": 116.0, "epoch": 0.1565, "grad_norm": 1.0075751543045044, "kl": 0.12949231266975403, "learning_rate": 4.9515366463665324e-06, "loss": 0.0052, "reward": 1.21875, "reward_std": 0.8284828662872314, "rewards/_accuracy_reward": 0.46875, "rewards/_format_reward": 0.75, "step": 626 }, { "completion_length": 163.25, "epoch": 0.15675, "grad_norm": 0.7406784296035767, "kl": 0.08424215018749237, "learning_rate": 4.951108224773189e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 627 }, { "completion_length": 149.625, "epoch": 0.157, "grad_norm": 0.7964242100715637, "kl": 0.10702201724052429, "learning_rate": 4.9506779365543054e-06, "loss": 0.0043, "reward": 1.5199999809265137, "reward_std": 0.5133086442947388, "rewards/_accuracy_reward": 0.5199999809265137, "rewards/_format_reward": 1.0, "step": 628 }, { "completion_length": 160.75, "epoch": 0.15725, "grad_norm": 0.8447659611701965, "kl": 0.11376137286424637, "learning_rate": 4.950245782037566e-06, "loss": 0.0046, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 629 }, { "completion_length": 140.375, "epoch": 0.1575, "grad_norm": 0.713877260684967, "kl": 0.06790605187416077, "learning_rate": 4.949811761552074e-06, "loss": 0.0027, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 630 }, { "completion_length": 128.875, "epoch": 0.15775, "grad_norm": 0.8207094669342041, "kl": 0.0979921892285347, "learning_rate": 4.9493758754283575e-06, "loss": 0.0039, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 631 }, { "completion_length": 117.875, "epoch": 0.158, "grad_norm": 1.2843239307403564, "kl": 0.11575803905725479, "learning_rate": 4.94893812399836e-06, "loss": 0.0046, "reward": 1.399999976158142, "reward_std": 0.720119059085846, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 0.875, "step": 632 }, { "completion_length": 130.625, "epoch": 0.15825, "grad_norm": 0.7998097538948059, "kl": 0.10970651358366013, "learning_rate": 4.9484985075954505e-06, "loss": 0.0044, "reward": 1.6262500286102295, "reward_std": 0.7428312301635742, "rewards/_accuracy_reward": 0.7512500286102295, "rewards/_format_reward": 0.875, "step": 633 }, { "completion_length": 86.875, "epoch": 0.1585, "grad_norm": 0.2539229393005371, "kl": 0.0996176227927208, "learning_rate": 4.948057026554415e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 634 }, { "completion_length": 79.375, "epoch": 0.15875, "grad_norm": 0.9077969789505005, "kl": 0.09100610762834549, "learning_rate": 4.94761368121146e-06, "loss": 0.0036, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 635 }, { "completion_length": 78.75, "epoch": 0.159, "grad_norm": 0.9096614718437195, "kl": 0.10145445168018341, "learning_rate": 4.947168471904213e-06, "loss": 0.0041, "reward": 1.3875000476837158, "reward_std": 0.3879893124103546, "rewards/_accuracy_reward": 0.38749998807907104, "rewards/_format_reward": 1.0, "step": 636 }, { "completion_length": 131.375, "epoch": 0.15925, "grad_norm": 0.593658983707428, "kl": 0.08535154908895493, "learning_rate": 4.94672139897172e-06, "loss": 0.0034, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 637 }, { "completion_length": 140.125, "epoch": 0.1595, "grad_norm": 0.05550685152411461, "kl": 0.09320636093616486, "learning_rate": 4.946272462754447e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 638 }, { "completion_length": 115.125, "epoch": 0.15975, "grad_norm": 0.0472259521484375, "kl": 0.08565588295459747, "learning_rate": 4.945821663594277e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 639 }, { "completion_length": 109.75, "epoch": 0.16, "grad_norm": 1.4830896854400635, "kl": 0.15049146115779877, "learning_rate": 4.9453690018345144e-06, "loss": 0.006, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 640 }, { "completion_length": 108.0, "epoch": 0.16025, "grad_norm": 0.041197896003723145, "kl": 0.1003151386976242, "learning_rate": 4.944914477819881e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 641 }, { "completion_length": 128.25, "epoch": 0.1605, "grad_norm": 0.6992077827453613, "kl": 0.12279914319515228, "learning_rate": 4.944458091896515e-06, "loss": 0.0049, "reward": 1.3125, "reward_std": 0.4299086630344391, "rewards/_accuracy_reward": 0.3124999701976776, "rewards/_format_reward": 1.0, "step": 642 }, { "completion_length": 58.25, "epoch": 0.16075, "grad_norm": 1.5356488227844238, "kl": 0.13889075815677643, "learning_rate": 4.943999844411978e-06, "loss": 0.0056, "reward": 1.068750023841858, "reward_std": 0.8936032652854919, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.625, "step": 643 }, { "completion_length": 142.875, "epoch": 0.161, "grad_norm": 0.6315851211547852, "kl": 0.07377047091722488, "learning_rate": 4.9435397357152406e-06, "loss": 0.003, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 644 }, { "completion_length": 115.25, "epoch": 0.16125, "grad_norm": 1.5424988269805908, "kl": 0.09540820121765137, "learning_rate": 4.943077766156698e-06, "loss": 0.0038, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 645 }, { "completion_length": 92.875, "epoch": 0.1615, "grad_norm": 0.86021488904953, "kl": 0.13427940011024475, "learning_rate": 4.94261393608816e-06, "loss": 0.0054, "reward": 1.4375, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 1.0, "step": 646 }, { "completion_length": 91.0, "epoch": 0.16175, "grad_norm": 0.054148320108652115, "kl": 0.06728003174066544, "learning_rate": 4.942148245862852e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 647 }, { "completion_length": 95.5, "epoch": 0.162, "grad_norm": 0.03492613136768341, "kl": 0.09255649149417877, "learning_rate": 4.9416806958354206e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 648 }, { "completion_length": 166.5, "epoch": 0.16225, "grad_norm": 0.9836525917053223, "kl": 0.08547134697437286, "learning_rate": 4.941211286361922e-06, "loss": 0.0034, "reward": 1.756250023841858, "reward_std": 0.6894291639328003, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 649 }, { "completion_length": 166.875, "epoch": 0.1625, "grad_norm": 1.5894322395324707, "kl": 0.11034282296895981, "learning_rate": 4.9407400177998335e-06, "loss": 0.0044, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 650 }, { "completion_length": 76.625, "epoch": 0.16275, "grad_norm": 0.8325356841087341, "kl": 0.09968439489603043, "learning_rate": 4.940266890508048e-06, "loss": 0.004, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 651 }, { "completion_length": 84.25, "epoch": 0.163, "grad_norm": 0.9667750000953674, "kl": 0.08716170489788055, "learning_rate": 4.939791904846869e-06, "loss": 0.0035, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 652 }, { "completion_length": 152.5, "epoch": 0.16325, "grad_norm": 0.15192466974258423, "kl": 0.11996634304523468, "learning_rate": 4.9393150611780215e-06, "loss": 0.0048, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 653 }, { "completion_length": 136.5, "epoch": 0.1635, "grad_norm": 0.6134188771247864, "kl": 0.0701427310705185, "learning_rate": 4.938836359864641e-06, "loss": 0.0028, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 654 }, { "completion_length": 169.75, "epoch": 0.16375, "grad_norm": 0.5714187622070312, "kl": 0.0795215517282486, "learning_rate": 4.938355801271282e-06, "loss": 0.0032, "reward": 1.5499999523162842, "reward_std": 0.4855042099952698, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 655 }, { "completion_length": 88.125, "epoch": 0.164, "grad_norm": 1.2346508502960205, "kl": 0.3160145878791809, "learning_rate": 4.937873385763909e-06, "loss": 0.0126, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 656 }, { "completion_length": 86.25, "epoch": 0.16425, "grad_norm": 0.9534627199172974, "kl": 0.07120765745639801, "learning_rate": 4.937389113709902e-06, "loss": 0.0028, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 657 }, { "completion_length": 112.5, "epoch": 0.1645, "grad_norm": 0.7603535652160645, "kl": 0.1263781189918518, "learning_rate": 4.936902985478055e-06, "loss": 0.0051, "reward": 1.1375000476837158, "reward_std": 0.548211395740509, "rewards/_accuracy_reward": 0.26249998807907104, "rewards/_format_reward": 0.875, "step": 658 }, { "completion_length": 124.125, "epoch": 0.16475, "grad_norm": 1.240847110748291, "kl": 0.10992362350225449, "learning_rate": 4.936415001438577e-06, "loss": 0.0044, "reward": 1.537500023841858, "reward_std": 0.7322909235954285, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.875, "step": 659 }, { "completion_length": 127.625, "epoch": 0.165, "grad_norm": 0.9347513914108276, "kl": 0.10446102917194366, "learning_rate": 4.935925161963089e-06, "loss": 0.0042, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.875, "step": 660 }, { "completion_length": 157.5, "epoch": 0.16525, "grad_norm": 1.6045786142349243, "kl": 0.11034439504146576, "learning_rate": 4.935433467424624e-06, "loss": 0.0044, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 661 }, { "completion_length": 123.125, "epoch": 0.1655, "grad_norm": 1.0717341899871826, "kl": 0.10492967069149017, "learning_rate": 4.93493991819763e-06, "loss": 0.0042, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 662 }, { "completion_length": 132.125, "epoch": 0.16575, "grad_norm": 0.7541182637214661, "kl": 0.06509540975093842, "learning_rate": 4.934444514657964e-06, "loss": 0.0026, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 663 }, { "completion_length": 150.375, "epoch": 0.166, "grad_norm": 0.8722965717315674, "kl": 0.09795466810464859, "learning_rate": 4.933947257182901e-06, "loss": 0.0039, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 664 }, { "completion_length": 162.5, "epoch": 0.16625, "grad_norm": 0.7149991393089294, "kl": 0.07934553176164627, "learning_rate": 4.933448146151122e-06, "loss": 0.0032, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.625, "step": 665 }, { "completion_length": 155.5, "epoch": 0.1665, "grad_norm": 1.3303704261779785, "kl": 0.12056293338537216, "learning_rate": 4.932947181942721e-06, "loss": 0.0048, "reward": 1.2950000762939453, "reward_std": 0.6939946413040161, "rewards/_accuracy_reward": 0.41999998688697815, "rewards/_format_reward": 0.875, "step": 666 }, { "completion_length": 183.0, "epoch": 0.16675, "grad_norm": 0.6829738616943359, "kl": 0.09543811529874802, "learning_rate": 4.932444364939205e-06, "loss": 0.0038, "reward": 1.1687500476837158, "reward_std": 0.8314949870109558, "rewards/_accuracy_reward": 0.41874998807907104, "rewards/_format_reward": 0.75, "step": 667 }, { "completion_length": 124.375, "epoch": 0.167, "grad_norm": 1.0810277462005615, "kl": 0.11649039387702942, "learning_rate": 4.9319396955234925e-06, "loss": 0.0047, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 668 }, { "completion_length": 184.875, "epoch": 0.16725, "grad_norm": 1.8912442922592163, "kl": 0.2582206428050995, "learning_rate": 4.9314331740799084e-06, "loss": 0.0103, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 669 }, { "completion_length": 135.875, "epoch": 0.1675, "grad_norm": 0.932590901851654, "kl": 0.13341131806373596, "learning_rate": 4.930924800994192e-06, "loss": 0.0053, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 670 }, { "completion_length": 92.5, "epoch": 0.16775, "grad_norm": 0.3577817976474762, "kl": 0.11805645376443863, "learning_rate": 4.930414576653492e-06, "loss": 0.0047, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 671 }, { "completion_length": 183.25, "epoch": 0.168, "grad_norm": 2.2905547618865967, "kl": 0.2760542035102844, "learning_rate": 4.9299025014463665e-06, "loss": 0.011, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 672 }, { "completion_length": 153.75, "epoch": 0.16825, "grad_norm": 0.8730438947677612, "kl": 0.1262456476688385, "learning_rate": 4.9293885757627815e-06, "loss": 0.005, "reward": 1.6875, "reward_std": 0.4381372928619385, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 673 }, { "completion_length": 141.375, "epoch": 0.1685, "grad_norm": 0.07061377912759781, "kl": 0.09641307592391968, "learning_rate": 4.928872799994116e-06, "loss": 0.0039, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 674 }, { "completion_length": 144.5, "epoch": 0.16875, "grad_norm": 0.8018800020217896, "kl": 0.19013966619968414, "learning_rate": 4.928355174533153e-06, "loss": 0.0076, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 675 }, { "completion_length": 171.625, "epoch": 0.169, "grad_norm": 1.220819115638733, "kl": 0.15096057951450348, "learning_rate": 4.92783569977409e-06, "loss": 0.006, "reward": 1.149999976158142, "reward_std": 0.6358346939086914, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.75, "step": 676 }, { "completion_length": 125.75, "epoch": 0.16925, "grad_norm": 1.0009835958480835, "kl": 0.09804116189479828, "learning_rate": 4.927314376112528e-06, "loss": 0.0039, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 677 }, { "completion_length": 155.875, "epoch": 0.1695, "grad_norm": 0.7764220833778381, "kl": 0.12891684472560883, "learning_rate": 4.926791203945477e-06, "loss": 0.0052, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 678 }, { "completion_length": 139.25, "epoch": 0.16975, "grad_norm": 0.6513742804527283, "kl": 0.05859963223338127, "learning_rate": 4.926266183671356e-06, "loss": 0.0023, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 679 }, { "completion_length": 82.25, "epoch": 0.17, "grad_norm": 1.309946894645691, "kl": 0.12626594305038452, "learning_rate": 4.925739315689991e-06, "loss": 0.0051, "reward": 1.412500023841858, "reward_std": 0.3691205680370331, "rewards/_accuracy_reward": 0.4124999940395355, "rewards/_format_reward": 1.0, "step": 680 }, { "completion_length": 107.75, "epoch": 0.17025, "grad_norm": 1.0792083740234375, "kl": 0.1906372606754303, "learning_rate": 4.925210600402615e-06, "loss": 0.0076, "reward": 1.537500023841858, "reward_std": 0.7322909235954285, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.875, "step": 681 }, { "completion_length": 143.25, "epoch": 0.1705, "grad_norm": 0.5776306986808777, "kl": 0.05421634390950203, "learning_rate": 4.924680038211868e-06, "loss": 0.0022, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 682 }, { "completion_length": 99.625, "epoch": 0.17075, "grad_norm": 1.0922069549560547, "kl": 0.16642846167087555, "learning_rate": 4.924147629521794e-06, "loss": 0.0067, "reward": 1.4500000476837158, "reward_std": 0.6979562044143677, "rewards/_accuracy_reward": 0.574999988079071, "rewards/_format_reward": 0.875, "step": 683 }, { "completion_length": 157.375, "epoch": 0.171, "grad_norm": 0.7435649633407593, "kl": 0.08429885655641556, "learning_rate": 4.923613374737848e-06, "loss": 0.0034, "reward": 1.306249976158142, "reward_std": 0.4346078038215637, "rewards/_accuracy_reward": 0.4312499761581421, "rewards/_format_reward": 0.875, "step": 684 }, { "completion_length": 140.0, "epoch": 0.17125, "grad_norm": 0.8662862181663513, "kl": 0.06566808372735977, "learning_rate": 4.923077274266886e-06, "loss": 0.0026, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 685 }, { "completion_length": 148.0, "epoch": 0.1715, "grad_norm": 1.3568053245544434, "kl": 0.11993306875228882, "learning_rate": 4.922539328517174e-06, "loss": 0.0048, "reward": 1.7512500286102295, "reward_std": 0.460603266954422, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 0.875, "step": 686 }, { "completion_length": 116.75, "epoch": 0.17175, "grad_norm": 0.056372012943029404, "kl": 0.11503936350345612, "learning_rate": 4.92199953789838e-06, "loss": 0.0046, "reward": 1.25, "reward_std": 0.0, "rewards/_accuracy_reward": 0.25, "rewards/_format_reward": 1.0, "step": 687 }, { "completion_length": 110.5, "epoch": 0.172, "grad_norm": 0.8631861209869385, "kl": 0.07430507987737656, "learning_rate": 4.921457902821578e-06, "loss": 0.003, "reward": 1.1937499046325684, "reward_std": 0.33320683240890503, "rewards/_accuracy_reward": 0.19374999403953552, "rewards/_format_reward": 1.0, "step": 688 }, { "completion_length": 85.875, "epoch": 0.17225, "grad_norm": 0.04333508759737015, "kl": 0.11433293670415878, "learning_rate": 4.920914423699247e-06, "loss": 0.0046, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 689 }, { "completion_length": 159.75, "epoch": 0.1725, "grad_norm": 0.737657368183136, "kl": 0.07528231292963028, "learning_rate": 4.92036910094527e-06, "loss": 0.003, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 690 }, { "completion_length": 120.25, "epoch": 0.17275, "grad_norm": 0.8405797481536865, "kl": 0.07952480018138885, "learning_rate": 4.919821934974933e-06, "loss": 0.0032, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 691 }, { "completion_length": 132.5, "epoch": 0.173, "grad_norm": 0.6791375279426575, "kl": 0.0799427479505539, "learning_rate": 4.9192729262049285e-06, "loss": 0.0032, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7574999928474426, "rewards/_format_reward": 1.0, "step": 692 }, { "completion_length": 122.875, "epoch": 0.17325, "grad_norm": 0.7748632431030273, "kl": 0.11183080077171326, "learning_rate": 4.918722075053349e-06, "loss": 0.0045, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 693 }, { "completion_length": 100.375, "epoch": 0.1735, "grad_norm": 0.9572948217391968, "kl": 0.0853419229388237, "learning_rate": 4.918169381939693e-06, "loss": 0.0034, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 694 }, { "completion_length": 158.875, "epoch": 0.17375, "grad_norm": 0.7936824560165405, "kl": 0.062092866748571396, "learning_rate": 4.917614847284858e-06, "loss": 0.0025, "reward": 1.3125, "reward_std": 0.4299086630344391, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 1.0, "step": 695 }, { "completion_length": 147.5, "epoch": 0.174, "grad_norm": 0.05352199077606201, "kl": 0.0950603038072586, "learning_rate": 4.917058471511149e-06, "loss": 0.0038, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 696 }, { "completion_length": 189.875, "epoch": 0.17425, "grad_norm": 0.8454087972640991, "kl": 0.14175119996070862, "learning_rate": 4.916500255042269e-06, "loss": 0.0057, "reward": 1.381250023841858, "reward_std": 0.9133679866790771, "rewards/_accuracy_reward": 0.6312500238418579, "rewards/_format_reward": 0.75, "step": 697 }, { "completion_length": 138.125, "epoch": 0.1745, "grad_norm": 0.7943523526191711, "kl": 0.09450940042734146, "learning_rate": 4.915940198303324e-06, "loss": 0.0038, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 698 }, { "completion_length": 139.375, "epoch": 0.17475, "grad_norm": 0.6788039803504944, "kl": 0.06518439203500748, "learning_rate": 4.915378301720822e-06, "loss": 0.0026, "reward": 1.3875000476837158, "reward_std": 0.3879893124103546, "rewards/_accuracy_reward": 0.38749998807907104, "rewards/_format_reward": 1.0, "step": 699 }, { "completion_length": 116.875, "epoch": 0.175, "grad_norm": 0.8311275243759155, "kl": 0.08171775192022324, "learning_rate": 4.914814565722671e-06, "loss": 0.0033, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 700 }, { "completion_length": 150.0, "epoch": 0.17525, "grad_norm": 0.5767417550086975, "kl": 0.06639153510332108, "learning_rate": 4.914248990738182e-06, "loss": 0.0027, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 701 }, { "completion_length": 165.25, "epoch": 0.1755, "grad_norm": 0.6517013311386108, "kl": 0.1013181060552597, "learning_rate": 4.913681577198063e-06, "loss": 0.0041, "reward": 1.1624999046325684, "reward_std": 0.338853120803833, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 0.875, "step": 702 }, { "completion_length": 113.625, "epoch": 0.17575, "grad_norm": 0.9347190260887146, "kl": 0.09112431108951569, "learning_rate": 4.913112325534426e-06, "loss": 0.0036, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 703 }, { "completion_length": 97.75, "epoch": 0.176, "grad_norm": 0.05434059351682663, "kl": 0.09588680416345596, "learning_rate": 4.912541236180779e-06, "loss": 0.0038, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 704 }, { "completion_length": 149.625, "epoch": 0.17625, "grad_norm": 0.0285491980612278, "kl": 0.0574335977435112, "learning_rate": 4.9119683095720325e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 705 }, { "completion_length": 130.625, "epoch": 0.1765, "grad_norm": 0.8181139826774597, "kl": 0.06118696555495262, "learning_rate": 4.9113935461444955e-06, "loss": 0.0024, "reward": 1.6637499332427979, "reward_std": 0.4691310524940491, "rewards/_accuracy_reward": 0.6637499928474426, "rewards/_format_reward": 1.0, "step": 706 }, { "completion_length": 139.75, "epoch": 0.17675, "grad_norm": 0.4688571095466614, "kl": 0.058115698397159576, "learning_rate": 4.910816946335875e-06, "loss": 0.0023, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 707 }, { "completion_length": 164.0, "epoch": 0.177, "grad_norm": 0.6474580764770508, "kl": 0.16749340295791626, "learning_rate": 4.910238510585275e-06, "loss": 0.0067, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.518750011920929, "rewards/_format_reward": 0.875, "step": 708 }, { "completion_length": 174.25, "epoch": 0.17725, "grad_norm": 0.6559945940971375, "kl": 0.11057315766811371, "learning_rate": 4.909658239333203e-06, "loss": 0.0044, "reward": 1.2687499523162842, "reward_std": 0.699968159198761, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.75, "step": 709 }, { "completion_length": 160.25, "epoch": 0.1775, "grad_norm": 0.049524981528520584, "kl": 0.06787683814764023, "learning_rate": 4.909076133021558e-06, "loss": 0.0027, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 710 }, { "completion_length": 180.5, "epoch": 0.17775, "grad_norm": 0.7570251822471619, "kl": 0.08942964673042297, "learning_rate": 4.9084921920936405e-06, "loss": 0.0036, "reward": 0.9187499284744263, "reward_std": 0.3712310194969177, "rewards/_accuracy_reward": 0.04374999925494194, "rewards/_format_reward": 0.875, "step": 711 }, { "completion_length": 163.625, "epoch": 0.178, "grad_norm": 0.6288597583770752, "kl": 0.07151935994625092, "learning_rate": 4.907906416994146e-06, "loss": 0.0029, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 712 }, { "completion_length": 132.625, "epoch": 0.17825, "grad_norm": 0.044216256588697433, "kl": 0.07765813916921616, "learning_rate": 4.907318808169168e-06, "loss": 0.0031, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 713 }, { "completion_length": 190.0, "epoch": 0.1785, "grad_norm": 0.9514076709747314, "kl": 0.0776422843337059, "learning_rate": 4.906729366066197e-06, "loss": 0.0031, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 714 }, { "completion_length": 82.625, "epoch": 0.17875, "grad_norm": 0.046869371086359024, "kl": 0.06022655963897705, "learning_rate": 4.906138091134118e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 715 }, { "completion_length": 141.625, "epoch": 0.179, "grad_norm": 1.30816650390625, "kl": 0.0730072632431984, "learning_rate": 4.905544983823214e-06, "loss": 0.0029, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 716 }, { "completion_length": 109.5, "epoch": 0.17925, "grad_norm": 0.7696007490158081, "kl": 0.08097223192453384, "learning_rate": 4.904950044585159e-06, "loss": 0.0032, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 717 }, { "completion_length": 163.0, "epoch": 0.1795, "grad_norm": 0.674775242805481, "kl": 0.06911822408437729, "learning_rate": 4.904353273873029e-06, "loss": 0.0028, "reward": 1.28125, "reward_std": 0.6844902038574219, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 0.875, "step": 718 }, { "completion_length": 176.25, "epoch": 0.17975, "grad_norm": 0.5623289346694946, "kl": 0.05954327434301376, "learning_rate": 4.903754672141288e-06, "loss": 0.0024, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 719 }, { "completion_length": 147.5, "epoch": 0.18, "grad_norm": 0.7491151690483093, "kl": 0.08063170313835144, "learning_rate": 4.903154239845798e-06, "loss": 0.0032, "reward": 1.4212499856948853, "reward_std": 0.485222727060318, "rewards/_accuracy_reward": 0.42124998569488525, "rewards/_format_reward": 1.0, "step": 720 }, { "completion_length": 147.625, "epoch": 0.18025, "grad_norm": 0.0454736165702343, "kl": 0.06635451316833496, "learning_rate": 4.902551977443813e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 721 }, { "completion_length": 196.875, "epoch": 0.1805, "grad_norm": 0.5895596146583557, "kl": 0.11441276967525482, "learning_rate": 4.901947885393986e-06, "loss": 0.0046, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 722 }, { "completion_length": 169.0, "epoch": 0.18075, "grad_norm": 0.07282302528619766, "kl": 0.09880199283361435, "learning_rate": 4.901341964156356e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 723 }, { "completion_length": 109.875, "epoch": 0.181, "grad_norm": 0.036335721611976624, "kl": 0.05390893295407295, "learning_rate": 4.900734214192358e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 724 }, { "completion_length": 130.0, "epoch": 0.18125, "grad_norm": 0.9860284924507141, "kl": 0.10044866800308228, "learning_rate": 4.900124635964823e-06, "loss": 0.004, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 725 }, { "completion_length": 162.625, "epoch": 0.1815, "grad_norm": 0.8596735596656799, "kl": 0.09692024439573288, "learning_rate": 4.899513229937968e-06, "loss": 0.0039, "reward": 1.6375000476837158, "reward_std": 0.7224709987640381, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 0.875, "step": 726 }, { "completion_length": 102.25, "epoch": 0.18175, "grad_norm": 0.039204664528369904, "kl": 0.07592302560806274, "learning_rate": 4.898899996577407e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 727 }, { "completion_length": 131.5, "epoch": 0.182, "grad_norm": 0.8768433332443237, "kl": 0.07269235700368881, "learning_rate": 4.898284936350144e-06, "loss": 0.0029, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 728 }, { "completion_length": 108.5, "epoch": 0.18225, "grad_norm": 0.023332836106419563, "kl": 0.03946740925312042, "learning_rate": 4.897668049724574e-06, "loss": 0.0016, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 729 }, { "completion_length": 187.0, "epoch": 0.1825, "grad_norm": 0.7027626633644104, "kl": 0.09945539385080338, "learning_rate": 4.897049337170483e-06, "loss": 0.004, "reward": 1.181249976158142, "reward_std": 0.6335486769676208, "rewards/_accuracy_reward": 0.3062499761581421, "rewards/_format_reward": 0.875, "step": 730 }, { "completion_length": 167.375, "epoch": 0.18275, "grad_norm": 0.029858523979783058, "kl": 0.04215020686388016, "learning_rate": 4.896428799159048e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 731 }, { "completion_length": 108.625, "epoch": 0.183, "grad_norm": 0.8350287675857544, "kl": 0.058248016983270645, "learning_rate": 4.8958064361628334e-06, "loss": 0.0023, "reward": 1.2000000476837158, "reward_std": 0.09258202463388443, "rewards/_accuracy_reward": 0.20000000298023224, "rewards/_format_reward": 1.0, "step": 732 }, { "completion_length": 153.25, "epoch": 0.18325, "grad_norm": 0.8019189238548279, "kl": 0.06479734927415848, "learning_rate": 4.8951822486557985e-06, "loss": 0.0026, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 733 }, { "completion_length": 117.0, "epoch": 0.1835, "grad_norm": 0.0786304697394371, "kl": 0.07490724325180054, "learning_rate": 4.894556237113287e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 734 }, { "completion_length": 153.25, "epoch": 0.18375, "grad_norm": 0.7346017360687256, "kl": 0.0502316989004612, "learning_rate": 4.8939284020120365e-06, "loss": 0.002, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 735 }, { "completion_length": 113.625, "epoch": 0.184, "grad_norm": 0.7519781589508057, "kl": 0.06321275234222412, "learning_rate": 4.893298743830168e-06, "loss": 0.0025, "reward": 1.4500000476837158, "reward_std": 0.46445053815841675, "rewards/_accuracy_reward": 0.574999988079071, "rewards/_format_reward": 0.875, "step": 736 }, { "completion_length": 176.75, "epoch": 0.18425, "grad_norm": 0.5180202126502991, "kl": 0.06774277240037918, "learning_rate": 4.892667263047196e-06, "loss": 0.0027, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 737 }, { "completion_length": 188.75, "epoch": 0.1845, "grad_norm": 0.5825796723365784, "kl": 0.05548809841275215, "learning_rate": 4.89203396014402e-06, "loss": 0.0022, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 738 }, { "completion_length": 146.625, "epoch": 0.18475, "grad_norm": 0.9164865016937256, "kl": 0.07509761303663254, "learning_rate": 4.891398835602925e-06, "loss": 0.003, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 739 }, { "completion_length": 140.125, "epoch": 0.185, "grad_norm": 0.8452046513557434, "kl": 0.054703302681446075, "learning_rate": 4.890761889907589e-06, "loss": 0.0022, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 740 }, { "completion_length": 99.625, "epoch": 0.18525, "grad_norm": 0.051303569227457047, "kl": 0.08967574685811996, "learning_rate": 4.890123123543074e-06, "loss": 0.0036, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 741 }, { "completion_length": 142.875, "epoch": 0.1855, "grad_norm": 0.7844648361206055, "kl": 0.057571351528167725, "learning_rate": 4.889482536995826e-06, "loss": 0.0023, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 742 }, { "completion_length": 126.5, "epoch": 0.18575, "grad_norm": 0.8076685667037964, "kl": 0.07212512940168381, "learning_rate": 4.888840130753681e-06, "loss": 0.0029, "reward": 1.0374999046325684, "reward_std": 0.5350233912467957, "rewards/_accuracy_reward": 0.16249999403953552, "rewards/_format_reward": 0.875, "step": 743 }, { "completion_length": 102.125, "epoch": 0.186, "grad_norm": 0.02857026271522045, "kl": 0.045980606228113174, "learning_rate": 4.888195905305859e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 744 }, { "completion_length": 102.625, "epoch": 0.18625, "grad_norm": 0.04947218671441078, "kl": 0.08267652988433838, "learning_rate": 4.887549861142967e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 745 }, { "completion_length": 99.5, "epoch": 0.1865, "grad_norm": 1.0107591152191162, "kl": 0.07869725674390793, "learning_rate": 4.886901998756995e-06, "loss": 0.0031, "reward": 1.34375, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.34375, "rewards/_format_reward": 1.0, "step": 746 }, { "completion_length": 148.0, "epoch": 0.18675, "grad_norm": 0.8579306602478027, "kl": 0.06560367345809937, "learning_rate": 4.886252318641316e-06, "loss": 0.0026, "reward": 1.125, "reward_std": 0.10350988060235977, "rewards/_accuracy_reward": 0.125, "rewards/_format_reward": 1.0, "step": 747 }, { "completion_length": 170.875, "epoch": 0.187, "grad_norm": 0.5661394596099854, "kl": 0.08825960755348206, "learning_rate": 4.885600821290692e-06, "loss": 0.0035, "reward": 1.274999976158142, "reward_std": 0.6974443197250366, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.875, "step": 748 }, { "completion_length": 137.0, "epoch": 0.18725, "grad_norm": 0.8725441694259644, "kl": 0.048608239740133286, "learning_rate": 4.884947507201268e-06, "loss": 0.0019, "reward": 1.0437499284744263, "reward_std": 0.5212878584861755, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 0.875, "step": 749 }, { "completion_length": 85.125, "epoch": 0.1875, "grad_norm": 1.9548423290252686, "kl": 0.10041142255067825, "learning_rate": 4.884292376870567e-06, "loss": 0.004, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 750 }, { "completion_length": 143.0, "epoch": 0.18775, "grad_norm": 0.8026706576347351, "kl": 0.060220833867788315, "learning_rate": 4.883635430797503e-06, "loss": 0.0024, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 751 }, { "completion_length": 151.125, "epoch": 0.188, "grad_norm": 0.5478430986404419, "kl": 0.051096536219120026, "learning_rate": 4.882976669482368e-06, "loss": 0.002, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 752 }, { "completion_length": 117.5, "epoch": 0.18825, "grad_norm": 0.6958257555961609, "kl": 0.048697832971811295, "learning_rate": 4.8823160934268365e-06, "loss": 0.0019, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 753 }, { "completion_length": 147.25, "epoch": 0.1885, "grad_norm": 0.6645660996437073, "kl": 0.0725683867931366, "learning_rate": 4.881653703133966e-06, "loss": 0.0029, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.875, "step": 754 }, { "completion_length": 147.125, "epoch": 0.18875, "grad_norm": 0.639782726764679, "kl": 0.047644682228565216, "learning_rate": 4.880989499108196e-06, "loss": 0.0019, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 1.0, "step": 755 }, { "completion_length": 176.875, "epoch": 0.189, "grad_norm": 0.6647182106971741, "kl": 0.07348424941301346, "learning_rate": 4.880323481855347e-06, "loss": 0.0029, "reward": 1.6324999332427979, "reward_std": 0.7272404432296753, "rewards/_accuracy_reward": 0.7575000524520874, "rewards/_format_reward": 0.875, "step": 756 }, { "completion_length": 144.5, "epoch": 0.18925, "grad_norm": 0.04022838547825813, "kl": 0.06717957556247711, "learning_rate": 4.8796556518826196e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 757 }, { "completion_length": 170.625, "epoch": 0.1895, "grad_norm": 0.6884375810623169, "kl": 0.04896247014403343, "learning_rate": 4.878986009698596e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 758 }, { "completion_length": 153.5, "epoch": 0.18975, "grad_norm": 0.7841867804527283, "kl": 0.07170876115560532, "learning_rate": 4.878314555813237e-06, "loss": 0.0029, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 759 }, { "completion_length": 141.0, "epoch": 0.19, "grad_norm": 0.7056874632835388, "kl": 0.0750083476305008, "learning_rate": 4.8776412907378845e-06, "loss": 0.003, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 760 }, { "completion_length": 130.25, "epoch": 0.19025, "grad_norm": 0.884601354598999, "kl": 0.0460171103477478, "learning_rate": 4.876966214985259e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 761 }, { "completion_length": 107.625, "epoch": 0.1905, "grad_norm": 0.6765369176864624, "kl": 0.06972219794988632, "learning_rate": 4.87628932906946e-06, "loss": 0.0028, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 762 }, { "completion_length": 145.625, "epoch": 0.19075, "grad_norm": 0.043741121888160706, "kl": 0.06062021851539612, "learning_rate": 4.875610633505965e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 763 }, { "completion_length": 131.375, "epoch": 0.191, "grad_norm": 0.5471161603927612, "kl": 0.03881002590060234, "learning_rate": 4.874930128811631e-06, "loss": 0.0016, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 764 }, { "completion_length": 91.25, "epoch": 0.19125, "grad_norm": 0.7647479176521301, "kl": 0.0523533895611763, "learning_rate": 4.874247815504693e-06, "loss": 0.0021, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 765 }, { "completion_length": 160.5, "epoch": 0.1915, "grad_norm": 0.6481146216392517, "kl": 0.052776042371988297, "learning_rate": 4.87356369410476e-06, "loss": 0.0021, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.875, "step": 766 }, { "completion_length": 165.75, "epoch": 0.19175, "grad_norm": 0.790132999420166, "kl": 0.07757475972175598, "learning_rate": 4.872877765132822e-06, "loss": 0.0031, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 767 }, { "completion_length": 214.125, "epoch": 0.192, "grad_norm": 0.4553297460079193, "kl": 0.06625451147556305, "learning_rate": 4.8721900291112415e-06, "loss": 0.0027, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 768 }, { "completion_length": 136.625, "epoch": 0.19225, "grad_norm": 0.7693415880203247, "kl": 0.06646943092346191, "learning_rate": 4.8715004865637616e-06, "loss": 0.0027, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 769 }, { "completion_length": 171.875, "epoch": 0.1925, "grad_norm": 0.7193809747695923, "kl": 0.050990864634513855, "learning_rate": 4.870809138015499e-06, "loss": 0.002, "reward": 1.2999999523162842, "reward_std": 0.6917885541915894, "rewards/_accuracy_reward": 0.42499998211860657, "rewards/_format_reward": 0.875, "step": 770 }, { "completion_length": 159.0, "epoch": 0.19275, "grad_norm": 0.8034641742706299, "kl": 0.056241609156131744, "learning_rate": 4.870115983992944e-06, "loss": 0.0022, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 771 }, { "completion_length": 180.5, "epoch": 0.193, "grad_norm": 0.02887933887541294, "kl": 0.05732205510139465, "learning_rate": 4.869421025023965e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 772 }, { "completion_length": 177.25, "epoch": 0.19325, "grad_norm": 0.03681923449039459, "kl": 0.05520891398191452, "learning_rate": 4.8687242616378026e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 773 }, { "completion_length": 145.5, "epoch": 0.1935, "grad_norm": 0.7011691331863403, "kl": 0.060724180191755295, "learning_rate": 4.868025694365073e-06, "loss": 0.0024, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 774 }, { "completion_length": 153.875, "epoch": 0.19375, "grad_norm": 0.8208072185516357, "kl": 0.06594527512788773, "learning_rate": 4.867325323737765e-06, "loss": 0.0026, "reward": 1.662500023841858, "reward_std": 0.7024192810058594, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 0.875, "step": 775 }, { "completion_length": 73.5, "epoch": 0.194, "grad_norm": 1.4499397277832031, "kl": 0.07009965926408768, "learning_rate": 4.866623150289241e-06, "loss": 0.0028, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 776 }, { "completion_length": 145.875, "epoch": 0.19425, "grad_norm": 0.7058715224266052, "kl": 0.04987990856170654, "learning_rate": 4.865919174554238e-06, "loss": 0.002, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 777 }, { "completion_length": 171.375, "epoch": 0.1945, "grad_norm": 0.552635908126831, "kl": 0.06489388644695282, "learning_rate": 4.865213397068864e-06, "loss": 0.0026, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.2812499701976776, "rewards/_format_reward": 0.875, "step": 778 }, { "completion_length": 115.5, "epoch": 0.19475, "grad_norm": 0.7429036498069763, "kl": 0.03541000187397003, "learning_rate": 4.8645058183705976e-06, "loss": 0.0014, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 779 }, { "completion_length": 139.375, "epoch": 0.195, "grad_norm": 0.6708275675773621, "kl": 0.0737345740199089, "learning_rate": 4.863796438998293e-06, "loss": 0.0029, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 780 }, { "completion_length": 80.625, "epoch": 0.19525, "grad_norm": 1.0399796962738037, "kl": 0.06832250952720642, "learning_rate": 4.863085259492171e-06, "loss": 0.0027, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 781 }, { "completion_length": 84.5, "epoch": 0.1955, "grad_norm": 0.02351105399429798, "kl": 0.053186241537332535, "learning_rate": 4.862372280393828e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 782 }, { "completion_length": 115.125, "epoch": 0.19575, "grad_norm": 0.7121822834014893, "kl": 0.07135632634162903, "learning_rate": 4.861657502246226e-06, "loss": 0.0029, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 783 }, { "completion_length": 126.625, "epoch": 0.196, "grad_norm": 0.8840348720550537, "kl": 0.03291170299053192, "learning_rate": 4.860940925593703e-06, "loss": 0.0013, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 784 }, { "completion_length": 190.625, "epoch": 0.19625, "grad_norm": 0.5434854626655579, "kl": 0.04633787274360657, "learning_rate": 4.860222550981961e-06, "loss": 0.0019, "reward": 1.3125, "reward_std": 0.873723566532135, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.75, "step": 785 }, { "completion_length": 138.25, "epoch": 0.1965, "grad_norm": 0.6125333309173584, "kl": 0.04772162437438965, "learning_rate": 4.8595023789580745e-06, "loss": 0.0019, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 786 }, { "completion_length": 160.375, "epoch": 0.19675, "grad_norm": 0.7128032445907593, "kl": 0.07775954157114029, "learning_rate": 4.858780410070484e-06, "loss": 0.0031, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.643750011920929, "rewards/_format_reward": 1.0, "step": 787 }, { "completion_length": 172.125, "epoch": 0.197, "grad_norm": 0.9440908432006836, "kl": 0.06862717866897583, "learning_rate": 4.858056644869002e-06, "loss": 0.0027, "reward": 1.1437499523162842, "reward_std": 0.8317097425460815, "rewards/_accuracy_reward": 0.39374998211860657, "rewards/_format_reward": 0.75, "step": 788 }, { "completion_length": 198.25, "epoch": 0.19725, "grad_norm": 0.6834743618965149, "kl": 0.055516257882118225, "learning_rate": 4.8573310839048085e-06, "loss": 0.0022, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 789 }, { "completion_length": 172.125, "epoch": 0.1975, "grad_norm": 0.5438332557678223, "kl": 0.07543152570724487, "learning_rate": 4.856603727730446e-06, "loss": 0.003, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 790 }, { "completion_length": 134.125, "epoch": 0.19775, "grad_norm": 0.6861331462860107, "kl": 0.06321967393159866, "learning_rate": 4.855874576899831e-06, "loss": 0.0025, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 1.0, "step": 791 }, { "completion_length": 104.75, "epoch": 0.198, "grad_norm": 1.0154536962509155, "kl": 0.08728273957967758, "learning_rate": 4.855143631968242e-06, "loss": 0.0035, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 792 }, { "completion_length": 132.0, "epoch": 0.19825, "grad_norm": 0.7148137092590332, "kl": 0.09352617710828781, "learning_rate": 4.854410893492326e-06, "loss": 0.0037, "reward": 1.4375, "reward_std": 0.8530408143997192, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.75, "step": 793 }, { "completion_length": 136.25, "epoch": 0.1985, "grad_norm": 0.6906197667121887, "kl": 0.056963000446558, "learning_rate": 4.853676362030095e-06, "loss": 0.0023, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 794 }, { "completion_length": 176.5, "epoch": 0.19875, "grad_norm": 0.04440414160490036, "kl": 0.08904334902763367, "learning_rate": 4.852940038140927e-06, "loss": 0.0036, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 795 }, { "completion_length": 120.375, "epoch": 0.199, "grad_norm": 0.07577986270189285, "kl": 0.07960440963506699, "learning_rate": 4.852201922385564e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 796 }, { "completion_length": 169.625, "epoch": 0.19925, "grad_norm": 0.6561159491539001, "kl": 0.06286200135946274, "learning_rate": 4.851462015326114e-06, "loss": 0.0025, "reward": 1.502500057220459, "reward_std": 0.7540509700775146, "rewards/_accuracy_reward": 0.627500057220459, "rewards/_format_reward": 0.875, "step": 797 }, { "completion_length": 117.75, "epoch": 0.1995, "grad_norm": 0.6921773552894592, "kl": 0.03880568593740463, "learning_rate": 4.850720317526047e-06, "loss": 0.0016, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 798 }, { "completion_length": 174.125, "epoch": 0.19975, "grad_norm": 0.025037772953510284, "kl": 0.05044008791446686, "learning_rate": 4.8499768295502e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 799 }, { "completion_length": 117.25, "epoch": 0.2, "grad_norm": 0.742163896560669, "kl": 0.08251883089542389, "learning_rate": 4.849231551964771e-06, "loss": 0.0033, "reward": 1.6887500286102295, "reward_std": 0.43590423464775085, "rewards/_accuracy_reward": 0.6887500286102295, "rewards/_format_reward": 1.0, "step": 800 }, { "completion_length": 134.5, "epoch": 0.20025, "grad_norm": 0.8082313537597656, "kl": 0.06866247951984406, "learning_rate": 4.8484844853373205e-06, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 801 }, { "completion_length": 140.375, "epoch": 0.2005, "grad_norm": 0.04818421229720116, "kl": 0.07798528671264648, "learning_rate": 4.847735630236773e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 802 }, { "completion_length": 136.625, "epoch": 0.20075, "grad_norm": 0.741947591304779, "kl": 0.07387517392635345, "learning_rate": 4.846984987233414e-06, "loss": 0.003, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 803 }, { "completion_length": 142.625, "epoch": 0.201, "grad_norm": 0.862678050994873, "kl": 0.06001199409365654, "learning_rate": 4.84623255689889e-06, "loss": 0.0024, "reward": 1.4187500476837158, "reward_std": 0.7235515117645264, "rewards/_accuracy_reward": 0.543749988079071, "rewards/_format_reward": 0.875, "step": 804 }, { "completion_length": 156.5, "epoch": 0.20125, "grad_norm": 0.6630946397781372, "kl": 0.06096571311354637, "learning_rate": 4.845478339806211e-06, "loss": 0.0024, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 805 }, { "completion_length": 128.875, "epoch": 0.2015, "grad_norm": 0.6322165131568909, "kl": 0.054042182862758636, "learning_rate": 4.844722336529745e-06, "loss": 0.0022, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 806 }, { "completion_length": 116.125, "epoch": 0.20175, "grad_norm": 0.06692986190319061, "kl": 0.0923774242401123, "learning_rate": 4.843964547645221e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 807 }, { "completion_length": 110.75, "epoch": 0.202, "grad_norm": 1.2808027267456055, "kl": 0.07200721651315689, "learning_rate": 4.84320497372973e-06, "loss": 0.0029, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 808 }, { "completion_length": 153.25, "epoch": 0.20225, "grad_norm": 0.6430820822715759, "kl": 0.061430469155311584, "learning_rate": 4.842443615361718e-06, "loss": 0.0025, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 809 }, { "completion_length": 117.125, "epoch": 0.2025, "grad_norm": 0.9564692974090576, "kl": 0.09131399542093277, "learning_rate": 4.841680473120994e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 810 }, { "completion_length": 163.375, "epoch": 0.20275, "grad_norm": 0.6316709518432617, "kl": 0.05354610085487366, "learning_rate": 4.840915547588725e-06, "loss": 0.0021, "reward": 1.3125, "reward_std": 0.4299086630344391, "rewards/_accuracy_reward": 0.3124999701976776, "rewards/_format_reward": 1.0, "step": 811 }, { "completion_length": 169.5, "epoch": 0.203, "grad_norm": 0.6619630455970764, "kl": 0.07358560711145401, "learning_rate": 4.840148839347434e-06, "loss": 0.0029, "reward": 1.287500023841858, "reward_std": 0.8786150217056274, "rewards/_accuracy_reward": 0.5375000238418579, "rewards/_format_reward": 0.75, "step": 812 }, { "completion_length": 161.875, "epoch": 0.20325, "grad_norm": 0.6946004629135132, "kl": 0.06069519370794296, "learning_rate": 4.839380348981002e-06, "loss": 0.0024, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 813 }, { "completion_length": 120.0, "epoch": 0.2035, "grad_norm": 0.7291589379310608, "kl": 0.07319469004869461, "learning_rate": 4.838610077074669e-06, "loss": 0.0029, "reward": 1.412500023841858, "reward_std": 0.7273975014686584, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.75, "step": 814 }, { "completion_length": 155.75, "epoch": 0.20375, "grad_norm": 0.6683049201965332, "kl": 0.06487289816141129, "learning_rate": 4.83783802421503e-06, "loss": 0.0026, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7574999928474426, "rewards/_format_reward": 1.0, "step": 815 }, { "completion_length": 60.625, "epoch": 0.204, "grad_norm": 1.0868297815322876, "kl": 0.07810930162668228, "learning_rate": 4.837064190990036e-06, "loss": 0.0031, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 816 }, { "completion_length": 153.375, "epoch": 0.20425, "grad_norm": 0.03452404588460922, "kl": 0.06881213933229446, "learning_rate": 4.836288577988997e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 817 }, { "completion_length": 114.0, "epoch": 0.2045, "grad_norm": 0.8613672256469727, "kl": 0.0681089535355568, "learning_rate": 4.835511185802574e-06, "loss": 0.0027, "reward": 1.1624999046325684, "reward_std": 0.6214901804924011, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 0.875, "step": 818 }, { "completion_length": 133.375, "epoch": 0.20475, "grad_norm": 0.030039411038160324, "kl": 0.06410800665616989, "learning_rate": 4.834732015022786e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 819 }, { "completion_length": 129.625, "epoch": 0.205, "grad_norm": 0.03727323189377785, "kl": 0.06535054743289948, "learning_rate": 4.833951066243004e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 820 }, { "completion_length": 127.125, "epoch": 0.20525, "grad_norm": 0.7753097414970398, "kl": 0.05497225001454353, "learning_rate": 4.833168340057957e-06, "loss": 0.0022, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.6687500476837158, "rewards/_format_reward": 1.0, "step": 821 }, { "completion_length": 117.5, "epoch": 0.2055, "grad_norm": 1.0983308553695679, "kl": 0.07010284066200256, "learning_rate": 4.832383837063723e-06, "loss": 0.0028, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 822 }, { "completion_length": 182.0, "epoch": 0.20575, "grad_norm": 0.9410927891731262, "kl": 0.08402031660079956, "learning_rate": 4.831597557857736e-06, "loss": 0.0034, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 823 }, { "completion_length": 100.375, "epoch": 0.206, "grad_norm": 0.8562926650047302, "kl": 0.10385450720787048, "learning_rate": 4.830809503038781e-06, "loss": 0.0042, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 824 }, { "completion_length": 92.875, "epoch": 0.20625, "grad_norm": 0.051143430173397064, "kl": 0.07470560818910599, "learning_rate": 4.830019673206997e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 825 }, { "completion_length": 173.75, "epoch": 0.2065, "grad_norm": 0.4477454423904419, "kl": 0.055387578904628754, "learning_rate": 4.829228068963873e-06, "loss": 0.0022, "reward": 0.9087499380111694, "reward_std": 0.3676348030567169, "rewards/_accuracy_reward": 0.03374999761581421, "rewards/_format_reward": 0.875, "step": 826 }, { "completion_length": 123.5, "epoch": 0.20675, "grad_norm": 0.7335723638534546, "kl": 0.06967341154813766, "learning_rate": 4.828434690912251e-06, "loss": 0.0028, "reward": 1.4562499523162842, "reward_std": 0.45781898498535156, "rewards/_accuracy_reward": 0.45624998211860657, "rewards/_format_reward": 1.0, "step": 827 }, { "completion_length": 158.375, "epoch": 0.207, "grad_norm": 0.787886917591095, "kl": 0.04925302416086197, "learning_rate": 4.8276395396563215e-06, "loss": 0.002, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 828 }, { "completion_length": 145.5, "epoch": 0.20725, "grad_norm": 0.045704782009124756, "kl": 0.06455915421247482, "learning_rate": 4.826842615801628e-06, "loss": 0.0026, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 829 }, { "completion_length": 159.75, "epoch": 0.2075, "grad_norm": 0.7860205173492432, "kl": 0.06909380853176117, "learning_rate": 4.826043919955062e-06, "loss": 0.0028, "reward": 1.212499976158142, "reward_std": 0.6180325746536255, "rewards/_accuracy_reward": 0.3374999761581421, "rewards/_format_reward": 0.875, "step": 830 }, { "completion_length": 141.125, "epoch": 0.20775, "grad_norm": 0.7281314134597778, "kl": 0.05228433758020401, "learning_rate": 4.825243452724865e-06, "loss": 0.0021, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 831 }, { "completion_length": 105.375, "epoch": 0.208, "grad_norm": 0.6620430946350098, "kl": 0.057157788425683975, "learning_rate": 4.824441214720629e-06, "loss": 0.0023, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 832 }, { "completion_length": 161.375, "epoch": 0.20825, "grad_norm": 0.6484058499336243, "kl": 0.05974971503019333, "learning_rate": 4.823637206553292e-06, "loss": 0.0024, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 833 }, { "completion_length": 150.125, "epoch": 0.2085, "grad_norm": 0.6747198104858398, "kl": 0.0536816343665123, "learning_rate": 4.8228314288351405e-06, "loss": 0.0021, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 834 }, { "completion_length": 149.125, "epoch": 0.20875, "grad_norm": 0.7284613847732544, "kl": 0.04191889986395836, "learning_rate": 4.822023882179811e-06, "loss": 0.0017, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 835 }, { "completion_length": 111.25, "epoch": 0.209, "grad_norm": 0.880097508430481, "kl": 0.056231893599033356, "learning_rate": 4.821214567202284e-06, "loss": 0.0022, "reward": 1.693750023841858, "reward_std": 0.4271479547023773, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 836 }, { "completion_length": 110.375, "epoch": 0.20925, "grad_norm": 0.04309391230344772, "kl": 0.044359609484672546, "learning_rate": 4.820403484518889e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 837 }, { "completion_length": 160.125, "epoch": 0.2095, "grad_norm": 0.6122051477432251, "kl": 0.07158859074115753, "learning_rate": 4.8195906347473e-06, "loss": 0.0029, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 838 }, { "completion_length": 144.75, "epoch": 0.20975, "grad_norm": 0.739141047000885, "kl": 0.04880441725254059, "learning_rate": 4.818776018506538e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 839 }, { "completion_length": 158.75, "epoch": 0.21, "grad_norm": 0.7204335331916809, "kl": 0.08992164582014084, "learning_rate": 4.817959636416969e-06, "loss": 0.0036, "reward": 1.693750023841858, "reward_std": 0.4271479547023773, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 840 }, { "completion_length": 156.25, "epoch": 0.21025, "grad_norm": 0.9104655385017395, "kl": 0.060113731771707535, "learning_rate": 4.817141489100302e-06, "loss": 0.0024, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 841 }, { "completion_length": 163.0, "epoch": 0.2105, "grad_norm": 0.6259116530418396, "kl": 0.05476780980825424, "learning_rate": 4.816321577179594e-06, "loss": 0.0022, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 842 }, { "completion_length": 137.0, "epoch": 0.21075, "grad_norm": 0.04252244159579277, "kl": 0.06695520132780075, "learning_rate": 4.815499901279242e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 843 }, { "completion_length": 139.75, "epoch": 0.211, "grad_norm": 0.025353508070111275, "kl": 0.05481298640370369, "learning_rate": 4.814676462024988e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 844 }, { "completion_length": 78.125, "epoch": 0.21125, "grad_norm": 1.239617943763733, "kl": 0.08213215321302414, "learning_rate": 4.8138512600439165e-06, "loss": 0.0033, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 845 }, { "completion_length": 151.0, "epoch": 0.2115, "grad_norm": 0.6661514639854431, "kl": 0.061599262058734894, "learning_rate": 4.8130242959644555e-06, "loss": 0.0025, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 846 }, { "completion_length": 188.625, "epoch": 0.21175, "grad_norm": 0.6768038272857666, "kl": 0.08369094878435135, "learning_rate": 4.812195570416374e-06, "loss": 0.0033, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 847 }, { "completion_length": 122.875, "epoch": 0.212, "grad_norm": 0.029910584911704063, "kl": 0.048999685794115067, "learning_rate": 4.811365084030784e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 848 }, { "completion_length": 118.5, "epoch": 0.21225, "grad_norm": 0.8244682550430298, "kl": 0.0865970253944397, "learning_rate": 4.810532837440134e-06, "loss": 0.0035, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 849 }, { "completion_length": 151.25, "epoch": 0.2125, "grad_norm": 0.03901531174778938, "kl": 0.05925080180168152, "learning_rate": 4.809698831278217e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 850 }, { "completion_length": 99.125, "epoch": 0.21275, "grad_norm": 0.764251708984375, "kl": 0.08566372096538544, "learning_rate": 4.808863066180167e-06, "loss": 0.0034, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 851 }, { "completion_length": 149.375, "epoch": 0.213, "grad_norm": 0.04004070907831192, "kl": 0.0646246075630188, "learning_rate": 4.808025542782453e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 852 }, { "completion_length": 148.875, "epoch": 0.21325, "grad_norm": 0.6666520237922668, "kl": 0.037436630576848984, "learning_rate": 4.807186261722886e-06, "loss": 0.0015, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 853 }, { "completion_length": 127.5, "epoch": 0.2135, "grad_norm": 0.6818517446517944, "kl": 0.04620020091533661, "learning_rate": 4.806345223640616e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 854 }, { "completion_length": 176.625, "epoch": 0.21375, "grad_norm": 0.7384047508239746, "kl": 0.05121048539876938, "learning_rate": 4.80550242917613e-06, "loss": 0.002, "reward": 1.7512500286102295, "reward_std": 0.460603266954422, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 0.875, "step": 855 }, { "completion_length": 154.25, "epoch": 0.214, "grad_norm": 0.6194064617156982, "kl": 0.059958089143037796, "learning_rate": 4.804657878971252e-06, "loss": 0.0024, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 856 }, { "completion_length": 98.0, "epoch": 0.21425, "grad_norm": 0.8191832304000854, "kl": 0.058027323335409164, "learning_rate": 4.803811573669143e-06, "loss": 0.0023, "reward": 1.4562499523162842, "reward_std": 0.45781898498535156, "rewards/_accuracy_reward": 0.45624998211860657, "rewards/_format_reward": 1.0, "step": 857 }, { "completion_length": 108.375, "epoch": 0.2145, "grad_norm": 0.6157840490341187, "kl": 0.03446981683373451, "learning_rate": 4.802963513914304e-06, "loss": 0.0014, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 858 }, { "completion_length": 139.375, "epoch": 0.21475, "grad_norm": 0.5992308259010315, "kl": 0.03977646678686142, "learning_rate": 4.802113700352567e-06, "loss": 0.0016, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 859 }, { "completion_length": 130.125, "epoch": 0.215, "grad_norm": 0.6911972165107727, "kl": 0.06907260417938232, "learning_rate": 4.801262133631101e-06, "loss": 0.0028, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 860 }, { "completion_length": 144.25, "epoch": 0.21525, "grad_norm": 0.928624153137207, "kl": 0.06295520812273026, "learning_rate": 4.800408814398414e-06, "loss": 0.0025, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 861 }, { "completion_length": 122.625, "epoch": 0.2155, "grad_norm": 0.8381000757217407, "kl": 0.05789494886994362, "learning_rate": 4.799553743304345e-06, "loss": 0.0023, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 862 }, { "completion_length": 152.625, "epoch": 0.21575, "grad_norm": 0.5595877766609192, "kl": 0.057784553617239, "learning_rate": 4.798696921000066e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 863 }, { "completion_length": 132.5, "epoch": 0.216, "grad_norm": 0.02260027453303337, "kl": 0.05030575767159462, "learning_rate": 4.7978383481380865e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 864 }, { "completion_length": 118.875, "epoch": 0.21625, "grad_norm": 0.7802910804748535, "kl": 0.05282498896121979, "learning_rate": 4.796978025372247e-06, "loss": 0.0021, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.875, "step": 865 }, { "completion_length": 156.125, "epoch": 0.2165, "grad_norm": 0.02012362889945507, "kl": 0.04851672798395157, "learning_rate": 4.796115953357718e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 866 }, { "completion_length": 138.5, "epoch": 0.21675, "grad_norm": 0.03598388284444809, "kl": 0.0767478421330452, "learning_rate": 4.795252132751008e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 867 }, { "completion_length": 121.125, "epoch": 0.217, "grad_norm": 0.9107113480567932, "kl": 0.0775720402598381, "learning_rate": 4.794386564209953e-06, "loss": 0.0031, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 868 }, { "completion_length": 168.875, "epoch": 0.21725, "grad_norm": 0.6234192848205566, "kl": 0.06976676732301712, "learning_rate": 4.793519248393721e-06, "loss": 0.0028, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.875, "step": 869 }, { "completion_length": 133.75, "epoch": 0.2175, "grad_norm": 0.7727608680725098, "kl": 0.12097954005002975, "learning_rate": 4.79265018596281e-06, "loss": 0.0048, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 870 }, { "completion_length": 146.625, "epoch": 0.21775, "grad_norm": 0.5868052244186401, "kl": 0.042058780789375305, "learning_rate": 4.791779377579051e-06, "loss": 0.0017, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 871 }, { "completion_length": 203.625, "epoch": 0.218, "grad_norm": 0.023968705907464027, "kl": 0.05295765399932861, "learning_rate": 4.790906823905599e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 872 }, { "completion_length": 167.625, "epoch": 0.21825, "grad_norm": 0.563790500164032, "kl": 0.03417491540312767, "learning_rate": 4.790032525606945e-06, "loss": 0.0014, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 873 }, { "completion_length": 104.875, "epoch": 0.2185, "grad_norm": 0.6778443455696106, "kl": 0.05654909834265709, "learning_rate": 4.7891564833489035e-06, "loss": 0.0023, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 874 }, { "completion_length": 156.25, "epoch": 0.21875, "grad_norm": 0.693367600440979, "kl": 0.05336631089448929, "learning_rate": 4.788278697798619e-06, "loss": 0.0021, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 875 }, { "completion_length": 187.125, "epoch": 0.219, "grad_norm": 0.6634543538093567, "kl": 0.06201925501227379, "learning_rate": 4.787399169624562e-06, "loss": 0.0025, "reward": 1.1937499046325684, "reward_std": 0.33320683240890503, "rewards/_accuracy_reward": 0.19374999403953552, "rewards/_format_reward": 1.0, "step": 876 }, { "completion_length": 169.125, "epoch": 0.21925, "grad_norm": 0.625119149684906, "kl": 0.06440308690071106, "learning_rate": 4.786517899496535e-06, "loss": 0.0026, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 877 }, { "completion_length": 101.875, "epoch": 0.2195, "grad_norm": 0.7212375402450562, "kl": 0.06768248230218887, "learning_rate": 4.7856348880856595e-06, "loss": 0.0027, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 878 }, { "completion_length": 125.5, "epoch": 0.21975, "grad_norm": 0.8191072344779968, "kl": 0.08255218714475632, "learning_rate": 4.78475013606439e-06, "loss": 0.0033, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 879 }, { "completion_length": 165.0, "epoch": 0.22, "grad_norm": 0.5947607159614563, "kl": 0.03656945377588272, "learning_rate": 4.783863644106502e-06, "loss": 0.0015, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 880 }, { "completion_length": 84.0, "epoch": 0.22025, "grad_norm": 0.7441555857658386, "kl": 0.1628367006778717, "learning_rate": 4.7829754128871e-06, "loss": 0.0065, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 881 }, { "completion_length": 123.5, "epoch": 0.2205, "grad_norm": 0.697911262512207, "kl": 0.08555817604064941, "learning_rate": 4.782085443082607e-06, "loss": 0.0034, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 882 }, { "completion_length": 120.125, "epoch": 0.22075, "grad_norm": 0.02246098220348358, "kl": 0.05425877869129181, "learning_rate": 4.7811937353707776e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 883 }, { "completion_length": 231.875, "epoch": 0.221, "grad_norm": 0.4748757779598236, "kl": 0.04530277103185654, "learning_rate": 4.780300290430683e-06, "loss": 0.0018, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 884 }, { "completion_length": 178.625, "epoch": 0.22125, "grad_norm": 0.7197502255439758, "kl": 0.05272772163152695, "learning_rate": 4.779405108942722e-06, "loss": 0.0021, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 885 }, { "completion_length": 112.125, "epoch": 0.2215, "grad_norm": 0.07369816303253174, "kl": 0.053687069565057755, "learning_rate": 4.778508191588613e-06, "loss": 0.0021, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 886 }, { "completion_length": 111.25, "epoch": 0.22175, "grad_norm": 0.04976990818977356, "kl": 0.053799863904714584, "learning_rate": 4.7776095390514e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 887 }, { "completion_length": 105.25, "epoch": 0.222, "grad_norm": 0.9817723631858826, "kl": 0.0718049630522728, "learning_rate": 4.776709152015443e-06, "loss": 0.0029, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 888 }, { "completion_length": 160.75, "epoch": 0.22225, "grad_norm": 0.8418934345245361, "kl": 0.046997714787721634, "learning_rate": 4.775807031166428e-06, "loss": 0.0019, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 889 }, { "completion_length": 164.125, "epoch": 0.2225, "grad_norm": 0.637934148311615, "kl": 0.05987370014190674, "learning_rate": 4.774903177191358e-06, "loss": 0.0024, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 890 }, { "completion_length": 205.5, "epoch": 0.22275, "grad_norm": 0.4765873849391937, "kl": 0.052201397716999054, "learning_rate": 4.773997590778558e-06, "loss": 0.0021, "reward": 1.4387500286102295, "reward_std": 0.7156503200531006, "rewards/_accuracy_reward": 0.5637500286102295, "rewards/_format_reward": 0.875, "step": 891 }, { "completion_length": 173.25, "epoch": 0.223, "grad_norm": 0.5394091010093689, "kl": 0.07298759371042252, "learning_rate": 4.773090272617672e-06, "loss": 0.0029, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 892 }, { "completion_length": 156.375, "epoch": 0.22325, "grad_norm": 0.7306042909622192, "kl": 0.06932543963193893, "learning_rate": 4.77218122339966e-06, "loss": 0.0028, "reward": 1.5012500286102295, "reward_std": 0.4208982586860657, "rewards/_accuracy_reward": 0.5012500286102295, "rewards/_format_reward": 1.0, "step": 893 }, { "completion_length": 138.75, "epoch": 0.2235, "grad_norm": 0.021331820636987686, "kl": 0.037514664232730865, "learning_rate": 4.771270443816805e-06, "loss": 0.0015, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 894 }, { "completion_length": 103.25, "epoch": 0.22375, "grad_norm": 0.024962345138192177, "kl": 0.04039287567138672, "learning_rate": 4.770357934562704e-06, "loss": 0.0016, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 895 }, { "completion_length": 129.5, "epoch": 0.224, "grad_norm": 0.022136209532618523, "kl": 0.05364343896508217, "learning_rate": 4.769443696332272e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 896 }, { "completion_length": 123.875, "epoch": 0.22425, "grad_norm": 1.0361751317977905, "kl": 0.08603714406490326, "learning_rate": 4.7685277298217425e-06, "loss": 0.0034, "reward": 1.3624999523162842, "reward_std": 0.404218852519989, "rewards/_accuracy_reward": 0.36249998211860657, "rewards/_format_reward": 1.0, "step": 897 }, { "completion_length": 114.5, "epoch": 0.2245, "grad_norm": 0.8131916522979736, "kl": 0.07286559790372849, "learning_rate": 4.767610035728663e-06, "loss": 0.0029, "reward": 1.337499976158142, "reward_std": 0.4181165397167206, "rewards/_accuracy_reward": 0.3374999761581421, "rewards/_format_reward": 1.0, "step": 898 }, { "completion_length": 197.25, "epoch": 0.22475, "grad_norm": 0.5685933232307434, "kl": 0.053996216505765915, "learning_rate": 4.766690614751897e-06, "loss": 0.0022, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 899 }, { "completion_length": 80.25, "epoch": 0.225, "grad_norm": 0.023312676697969437, "kl": 0.046684399247169495, "learning_rate": 4.765769467591626e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 900 }, { "completion_length": 173.625, "epoch": 0.22525, "grad_norm": 0.6596946120262146, "kl": 0.04578675329685211, "learning_rate": 4.76484659494934e-06, "loss": 0.0018, "reward": 1.6637499332427979, "reward_std": 0.4691310524940491, "rewards/_accuracy_reward": 0.6637499928474426, "rewards/_format_reward": 1.0, "step": 901 }, { "completion_length": 91.875, "epoch": 0.2255, "grad_norm": 0.9707310795783997, "kl": 0.03129902854561806, "learning_rate": 4.763921997527849e-06, "loss": 0.0013, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 902 }, { "completion_length": 134.25, "epoch": 0.22575, "grad_norm": 0.7488545775413513, "kl": 0.07043536752462387, "learning_rate": 4.762995676031275e-06, "loss": 0.0028, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 903 }, { "completion_length": 179.125, "epoch": 0.226, "grad_norm": 0.5836617350578308, "kl": 0.047344304621219635, "learning_rate": 4.762067631165049e-06, "loss": 0.0019, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 904 }, { "completion_length": 177.875, "epoch": 0.22625, "grad_norm": 0.5657011270523071, "kl": 0.03409140184521675, "learning_rate": 4.761137863635921e-06, "loss": 0.0014, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 905 }, { "completion_length": 183.125, "epoch": 0.2265, "grad_norm": 0.481889009475708, "kl": 0.048406727612018585, "learning_rate": 4.760206374151947e-06, "loss": 0.0019, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 906 }, { "completion_length": 135.75, "epoch": 0.22675, "grad_norm": 0.6987292170524597, "kl": 0.042507898062467575, "learning_rate": 4.759273163422496e-06, "loss": 0.0017, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 907 }, { "completion_length": 215.875, "epoch": 0.227, "grad_norm": 0.5568097829818726, "kl": 0.04911498725414276, "learning_rate": 4.7583382321582525e-06, "loss": 0.002, "reward": 1.131250023841858, "reward_std": 0.9902876615524292, "rewards/_accuracy_reward": 0.5062500238418579, "rewards/_format_reward": 0.625, "step": 908 }, { "completion_length": 178.125, "epoch": 0.22725, "grad_norm": 0.6918895244598389, "kl": 0.04337242990732193, "learning_rate": 4.757401581071203e-06, "loss": 0.0017, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 909 }, { "completion_length": 170.875, "epoch": 0.2275, "grad_norm": 0.04047883674502373, "kl": 0.06391174346208572, "learning_rate": 4.7564632108746524e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 910 }, { "completion_length": 190.0, "epoch": 0.22775, "grad_norm": 0.6083565950393677, "kl": 0.04682445526123047, "learning_rate": 4.755523122283206e-06, "loss": 0.0019, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 911 }, { "completion_length": 135.5, "epoch": 0.228, "grad_norm": 0.7495693564414978, "kl": 0.0452733151614666, "learning_rate": 4.754581316012785e-06, "loss": 0.0018, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 912 }, { "completion_length": 143.0, "epoch": 0.22825, "grad_norm": 0.9222220778465271, "kl": 0.038283564150333405, "learning_rate": 4.753637792780614e-06, "loss": 0.0015, "reward": 1.1624999046325684, "reward_std": 0.338853120803833, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 0.875, "step": 913 }, { "completion_length": 123.75, "epoch": 0.2285, "grad_norm": 0.6918537616729736, "kl": 0.04517769068479538, "learning_rate": 4.752692553305229e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 914 }, { "completion_length": 157.5, "epoch": 0.22875, "grad_norm": 0.9993019700050354, "kl": 0.061792753636837006, "learning_rate": 4.7517455983064694e-06, "loss": 0.0025, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 915 }, { "completion_length": 137.75, "epoch": 0.229, "grad_norm": 0.8047826290130615, "kl": 0.05265399068593979, "learning_rate": 4.750796928505484e-06, "loss": 0.0021, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 916 }, { "completion_length": 70.5, "epoch": 0.22925, "grad_norm": 1.218523383140564, "kl": 0.3203689157962799, "learning_rate": 4.749846544624725e-06, "loss": 0.0128, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 917 }, { "completion_length": 166.5, "epoch": 0.2295, "grad_norm": 0.7494506239891052, "kl": 0.047605931758880615, "learning_rate": 4.7488944473879515e-06, "loss": 0.0019, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 918 }, { "completion_length": 149.875, "epoch": 0.22975, "grad_norm": 0.5708422064781189, "kl": 0.06305453181266785, "learning_rate": 4.747940637520226e-06, "loss": 0.0025, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 919 }, { "completion_length": 146.375, "epoch": 0.23, "grad_norm": 0.6123313307762146, "kl": 0.05034901574254036, "learning_rate": 4.746985115747918e-06, "loss": 0.002, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 920 }, { "completion_length": 168.75, "epoch": 0.23025, "grad_norm": 0.02322268672287464, "kl": 0.04557321220636368, "learning_rate": 4.746027882798697e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 921 }, { "completion_length": 197.0, "epoch": 0.2305, "grad_norm": 0.6486432552337646, "kl": 0.06248953938484192, "learning_rate": 4.745068939401539e-06, "loss": 0.0025, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 922 }, { "completion_length": 163.75, "epoch": 0.23075, "grad_norm": 0.6944877505302429, "kl": 0.051626596599817276, "learning_rate": 4.744108286286721e-06, "loss": 0.0021, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 923 }, { "completion_length": 176.0, "epoch": 0.231, "grad_norm": 0.603113055229187, "kl": 0.07564710080623627, "learning_rate": 4.743145924185821e-06, "loss": 0.003, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 924 }, { "completion_length": 177.0, "epoch": 0.23125, "grad_norm": 0.5147649645805359, "kl": 0.06837765127420425, "learning_rate": 4.742181853831721e-06, "loss": 0.0027, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 925 }, { "completion_length": 155.75, "epoch": 0.2315, "grad_norm": 0.6233566403388977, "kl": 0.06354863196611404, "learning_rate": 4.741216075958602e-06, "loss": 0.0025, "reward": 1.412500023841858, "reward_std": 0.36912059783935547, "rewards/_accuracy_reward": 0.4124999940395355, "rewards/_format_reward": 1.0, "step": 926 }, { "completion_length": 184.125, "epoch": 0.23175, "grad_norm": 0.5731986165046692, "kl": 0.050587963312864304, "learning_rate": 4.740248591301945e-06, "loss": 0.002, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 927 }, { "completion_length": 100.5, "epoch": 0.232, "grad_norm": 1.4272077083587646, "kl": 0.08796297013759613, "learning_rate": 4.7392794005985324e-06, "loss": 0.0035, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 928 }, { "completion_length": 114.625, "epoch": 0.23225, "grad_norm": 0.027059296146035194, "kl": 0.060833338648080826, "learning_rate": 4.738308504586445e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 929 }, { "completion_length": 90.875, "epoch": 0.2325, "grad_norm": 0.9342624545097351, "kl": 0.058008529245853424, "learning_rate": 4.737335904005063e-06, "loss": 0.0023, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.668749988079071, "rewards/_format_reward": 1.0, "step": 930 }, { "completion_length": 138.625, "epoch": 0.23275, "grad_norm": 0.030422937124967575, "kl": 0.04037817567586899, "learning_rate": 4.736361599595063e-06, "loss": 0.0016, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 931 }, { "completion_length": 101.25, "epoch": 0.233, "grad_norm": 0.7583170533180237, "kl": 0.08360082656145096, "learning_rate": 4.735385592098421e-06, "loss": 0.0033, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 932 }, { "completion_length": 158.625, "epoch": 0.23325, "grad_norm": 0.6318457722663879, "kl": 0.05788834020495415, "learning_rate": 4.734407882258408e-06, "loss": 0.0023, "reward": 1.625, "reward_std": 0.40089187026023865, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 1.0, "step": 933 }, { "completion_length": 176.5, "epoch": 0.2335, "grad_norm": 0.6953318119049072, "kl": 0.0506259948015213, "learning_rate": 4.733428470819595e-06, "loss": 0.002, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 934 }, { "completion_length": 118.5, "epoch": 0.23375, "grad_norm": 0.021268269047141075, "kl": 0.03520062938332558, "learning_rate": 4.732447358527843e-06, "loss": 0.0014, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 935 }, { "completion_length": 102.875, "epoch": 0.234, "grad_norm": 0.8448748588562012, "kl": 0.07049023360013962, "learning_rate": 4.731464546130315e-06, "loss": 0.0028, "reward": 1.6375000476837158, "reward_std": 0.7224709987640381, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 0.875, "step": 936 }, { "completion_length": 159.875, "epoch": 0.23425, "grad_norm": 0.8189094066619873, "kl": 0.0628812164068222, "learning_rate": 4.730480034375462e-06, "loss": 0.0025, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 937 }, { "completion_length": 165.875, "epoch": 0.2345, "grad_norm": 0.026304002851247787, "kl": 0.06353601813316345, "learning_rate": 4.729493824013036e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 938 }, { "completion_length": 191.375, "epoch": 0.23475, "grad_norm": 0.6404834389686584, "kl": 0.040367912501096725, "learning_rate": 4.7285059157940765e-06, "loss": 0.0016, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 939 }, { "completion_length": 180.0, "epoch": 0.235, "grad_norm": 0.660751461982727, "kl": 0.11275404691696167, "learning_rate": 4.72751631047092e-06, "loss": 0.0045, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 940 }, { "completion_length": 176.875, "epoch": 0.23525, "grad_norm": 0.03521808236837387, "kl": 0.05358295515179634, "learning_rate": 4.726525008797194e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 941 }, { "completion_length": 91.25, "epoch": 0.2355, "grad_norm": 0.06520809978246689, "kl": 0.06617758423089981, "learning_rate": 4.725532011527817e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 942 }, { "completion_length": 193.625, "epoch": 0.23575, "grad_norm": 0.029271895065903664, "kl": 0.060000792145729065, "learning_rate": 4.724537319419e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 943 }, { "completion_length": 169.75, "epoch": 0.236, "grad_norm": 0.7201210260391235, "kl": 0.06569670885801315, "learning_rate": 4.723540933228245e-06, "loss": 0.0026, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 944 }, { "completion_length": 177.875, "epoch": 0.23625, "grad_norm": 0.5506649613380432, "kl": 0.04273473471403122, "learning_rate": 4.7225428537143414e-06, "loss": 0.0017, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.668749988079071, "rewards/_format_reward": 1.0, "step": 945 }, { "completion_length": 98.125, "epoch": 0.2365, "grad_norm": 0.021767426282167435, "kl": 0.08275524526834488, "learning_rate": 4.721543081637372e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 946 }, { "completion_length": 93.875, "epoch": 0.23675, "grad_norm": 0.7869265675544739, "kl": 0.06405540555715561, "learning_rate": 4.720541617758707e-06, "loss": 0.0026, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 947 }, { "completion_length": 172.125, "epoch": 0.237, "grad_norm": 0.5938262939453125, "kl": 0.06068947911262512, "learning_rate": 4.719538462841003e-06, "loss": 0.0024, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 948 }, { "completion_length": 112.25, "epoch": 0.23725, "grad_norm": 1.1759775876998901, "kl": 0.2916286885738373, "learning_rate": 4.718533617648209e-06, "loss": 0.0117, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 949 }, { "completion_length": 86.75, "epoch": 0.2375, "grad_norm": 0.8917730450630188, "kl": 0.08274582028388977, "learning_rate": 4.717527082945555e-06, "loss": 0.0033, "reward": 1.693750023841858, "reward_std": 0.4271479547023773, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 950 }, { "completion_length": 155.625, "epoch": 0.23775, "grad_norm": 0.02533833496272564, "kl": 0.04348806291818619, "learning_rate": 4.716518859499563e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 951 }, { "completion_length": 100.25, "epoch": 0.238, "grad_norm": 0.669034481048584, "kl": 0.039708979427814484, "learning_rate": 4.715508948078037e-06, "loss": 0.0016, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 952 }, { "completion_length": 93.25, "epoch": 0.23825, "grad_norm": 1.0357922315597534, "kl": 0.060257647186517715, "learning_rate": 4.714497349450071e-06, "loss": 0.0024, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 953 }, { "completion_length": 111.75, "epoch": 0.2385, "grad_norm": 0.7821860909461975, "kl": 0.03877865895628929, "learning_rate": 4.71348406438604e-06, "loss": 0.0016, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 954 }, { "completion_length": 155.625, "epoch": 0.23875, "grad_norm": 0.04612157121300697, "kl": 0.04425455257296562, "learning_rate": 4.712469093657605e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 955 }, { "completion_length": 149.75, "epoch": 0.239, "grad_norm": 0.7191103100776672, "kl": 0.06059260666370392, "learning_rate": 4.71145243803771e-06, "loss": 0.0024, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 956 }, { "completion_length": 169.0, "epoch": 0.23925, "grad_norm": 0.688480019569397, "kl": 0.05014285817742348, "learning_rate": 4.710434098300584e-06, "loss": 0.002, "reward": 1.5187499523162842, "reward_std": 0.7323824167251587, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 0.875, "step": 957 }, { "completion_length": 86.0, "epoch": 0.2395, "grad_norm": 0.7771919369697571, "kl": 0.06990315020084381, "learning_rate": 4.709414075221734e-06, "loss": 0.0028, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 958 }, { "completion_length": 126.75, "epoch": 0.23975, "grad_norm": 0.815279483795166, "kl": 0.07264941185712814, "learning_rate": 4.7083923695779546e-06, "loss": 0.0029, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 959 }, { "completion_length": 171.0, "epoch": 0.24, "grad_norm": 0.5627329349517822, "kl": 0.03297321870923042, "learning_rate": 4.707368982147318e-06, "loss": 0.0013, "reward": 1.0625, "reward_std": 0.5403371453285217, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.875, "step": 960 }, { "completion_length": 158.25, "epoch": 0.24025, "grad_norm": 0.4752623438835144, "kl": 0.046661727130413055, "learning_rate": 4.706343913709178e-06, "loss": 0.0019, "reward": 1.631250023841858, "reward_std": 0.7382108569145203, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 961 }, { "completion_length": 150.5, "epoch": 0.2405, "grad_norm": 0.06294679641723633, "kl": 0.06743825972080231, "learning_rate": 4.70531716504417e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 962 }, { "completion_length": 148.625, "epoch": 0.24075, "grad_norm": 0.8773144483566284, "kl": 0.05056190490722656, "learning_rate": 4.704288736934207e-06, "loss": 0.002, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 963 }, { "completion_length": 144.375, "epoch": 0.241, "grad_norm": 0.5290398001670837, "kl": 0.03290877863764763, "learning_rate": 4.703258630162481e-06, "loss": 0.0013, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 964 }, { "completion_length": 194.125, "epoch": 0.24125, "grad_norm": 0.6033351421356201, "kl": 0.05277324095368385, "learning_rate": 4.702226845513465e-06, "loss": 0.0021, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 1.0, "step": 965 }, { "completion_length": 169.5, "epoch": 0.2415, "grad_norm": 0.6577865481376648, "kl": 0.05829022079706192, "learning_rate": 4.701193383772905e-06, "loss": 0.0023, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 966 }, { "completion_length": 170.5, "epoch": 0.24175, "grad_norm": 0.6581544280052185, "kl": 0.05274191126227379, "learning_rate": 4.70015824572783e-06, "loss": 0.0021, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 967 }, { "completion_length": 155.75, "epoch": 0.242, "grad_norm": 0.6504667401313782, "kl": 0.04294908419251442, "learning_rate": 4.699121432166542e-06, "loss": 0.0017, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 968 }, { "completion_length": 113.125, "epoch": 0.24225, "grad_norm": 0.02235039882361889, "kl": 0.05611787736415863, "learning_rate": 4.6980829438786176e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 969 }, { "completion_length": 194.25, "epoch": 0.2425, "grad_norm": 0.6296171545982361, "kl": 0.05270276963710785, "learning_rate": 4.697042781654913e-06, "loss": 0.0021, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 970 }, { "completion_length": 122.75, "epoch": 0.24275, "grad_norm": 0.02494359202682972, "kl": 0.0459245890378952, "learning_rate": 4.696000946287558e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 971 }, { "completion_length": 139.5, "epoch": 0.243, "grad_norm": 0.6154767870903015, "kl": 0.053218137472867966, "learning_rate": 4.6949574385699514e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 972 }, { "completion_length": 145.875, "epoch": 0.24325, "grad_norm": 0.6306710243225098, "kl": 0.05243955925107002, "learning_rate": 4.693912259296773e-06, "loss": 0.0021, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 973 }, { "completion_length": 181.375, "epoch": 0.2435, "grad_norm": 0.5748218297958374, "kl": 0.07226546108722687, "learning_rate": 4.6928654092639725e-06, "loss": 0.0029, "reward": 0.9124999046325684, "reward_std": 0.6384971141815186, "rewards/_accuracy_reward": 0.16249999403953552, "rewards/_format_reward": 0.75, "step": 974 }, { "completion_length": 81.0, "epoch": 0.24375, "grad_norm": 0.027224192395806313, "kl": 0.04066862910985947, "learning_rate": 4.69181688926877e-06, "loss": 0.0016, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 975 }, { "completion_length": 139.375, "epoch": 0.244, "grad_norm": 0.03555990010499954, "kl": 0.0650315135717392, "learning_rate": 4.690766700109659e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 976 }, { "completion_length": 84.125, "epoch": 0.24425, "grad_norm": 1.2255222797393799, "kl": 0.055360615253448486, "learning_rate": 4.689714842586406e-06, "loss": 0.0022, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 977 }, { "completion_length": 161.875, "epoch": 0.2445, "grad_norm": 0.044598329812288284, "kl": 0.059485312551259995, "learning_rate": 4.688661317500045e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 978 }, { "completion_length": 122.125, "epoch": 0.24475, "grad_norm": 0.9748655557632446, "kl": 0.13712793588638306, "learning_rate": 4.687606125652882e-06, "loss": 0.0055, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 979 }, { "completion_length": 142.75, "epoch": 0.245, "grad_norm": 0.02790018729865551, "kl": 0.052116554230451584, "learning_rate": 4.68654926784849e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 980 }, { "completion_length": 154.0, "epoch": 0.24525, "grad_norm": 0.6151390075683594, "kl": 0.06243494153022766, "learning_rate": 4.685490744891713e-06, "loss": 0.0025, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 981 }, { "completion_length": 144.875, "epoch": 0.2455, "grad_norm": 0.7018941640853882, "kl": 0.03996715694665909, "learning_rate": 4.6844305575886635e-06, "loss": 0.0016, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 982 }, { "completion_length": 190.875, "epoch": 0.24575, "grad_norm": 0.6229122281074524, "kl": 0.047085534781217575, "learning_rate": 4.6833687067467185e-06, "loss": 0.0019, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 983 }, { "completion_length": 156.875, "epoch": 0.246, "grad_norm": 0.9269128441810608, "kl": 0.08278842270374298, "learning_rate": 4.682305193174524e-06, "loss": 0.0033, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 984 }, { "completion_length": 130.75, "epoch": 0.24625, "grad_norm": 0.8343520760536194, "kl": 0.030732639133930206, "learning_rate": 4.681240017681994e-06, "loss": 0.0012, "reward": 0.9249999523162842, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 0.875, "step": 985 }, { "completion_length": 183.75, "epoch": 0.2465, "grad_norm": 0.4889462888240814, "kl": 0.02942308969795704, "learning_rate": 4.680173181080302e-06, "loss": 0.0012, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 986 }, { "completion_length": 168.5, "epoch": 0.24675, "grad_norm": 0.733590841293335, "kl": 0.044175948947668076, "learning_rate": 4.679104684181893e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 987 }, { "completion_length": 132.125, "epoch": 0.247, "grad_norm": 0.7705006003379822, "kl": 0.04930579662322998, "learning_rate": 4.6780345278004744e-06, "loss": 0.002, "reward": 1.6637500524520874, "reward_std": 0.4691310524940491, "rewards/_accuracy_reward": 0.6637500524520874, "rewards/_format_reward": 1.0, "step": 988 }, { "completion_length": 156.5, "epoch": 0.24725, "grad_norm": 0.03196730837225914, "kl": 0.046509016305208206, "learning_rate": 4.676962712751015e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 989 }, { "completion_length": 136.5, "epoch": 0.2475, "grad_norm": 0.7628892660140991, "kl": 0.10023954510688782, "learning_rate": 4.675889239849749e-06, "loss": 0.004, "reward": 1.306249976158142, "reward_std": 0.6784633994102478, "rewards/_accuracy_reward": 0.4312500059604645, "rewards/_format_reward": 0.875, "step": 990 }, { "completion_length": 210.625, "epoch": 0.24775, "grad_norm": 0.5449919700622559, "kl": 0.04874037951231003, "learning_rate": 4.674814109914174e-06, "loss": 0.0019, "reward": 1.3125, "reward_std": 0.873723566532135, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.75, "step": 991 }, { "completion_length": 102.125, "epoch": 0.248, "grad_norm": 0.07450534403324127, "kl": 0.05309867858886719, "learning_rate": 4.673737323763048e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 992 }, { "completion_length": 104.125, "epoch": 0.24825, "grad_norm": 0.7853342294692993, "kl": 0.04614270478487015, "learning_rate": 4.67265888221639e-06, "loss": 0.0018, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 993 }, { "completion_length": 125.375, "epoch": 0.2485, "grad_norm": 0.8209336400032043, "kl": 0.040556248277425766, "learning_rate": 4.671578786095479e-06, "loss": 0.0016, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 994 }, { "completion_length": 157.125, "epoch": 0.24875, "grad_norm": 0.7643418908119202, "kl": 0.07217823714017868, "learning_rate": 4.670497036222856e-06, "loss": 0.0029, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 1.0, "step": 995 }, { "completion_length": 119.75, "epoch": 0.249, "grad_norm": 0.8099098801612854, "kl": 0.053473103791475296, "learning_rate": 4.669413633422322e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 996 }, { "completion_length": 155.75, "epoch": 0.24925, "grad_norm": 0.620407223701477, "kl": 0.05117916315793991, "learning_rate": 4.668328578518933e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 997 }, { "completion_length": 183.625, "epoch": 0.2495, "grad_norm": 0.8129308223724365, "kl": 0.07624640315771103, "learning_rate": 4.667241872339007e-06, "loss": 0.003, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.75, "step": 998 }, { "completion_length": 153.625, "epoch": 0.24975, "grad_norm": 0.6230663061141968, "kl": 0.04454692453145981, "learning_rate": 4.666153515710118e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 999 }, { "completion_length": 121.125, "epoch": 0.25, "grad_norm": 0.8729314804077148, "kl": 0.040495917201042175, "learning_rate": 4.665063509461098e-06, "loss": 0.0016, "reward": 1.6875, "reward_std": 0.4381372928619385, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 1000 }, { "completion_length": 118.375, "epoch": 0.25025, "grad_norm": 0.7109651565551758, "kl": 0.1086815893650055, "learning_rate": 4.66397185442203e-06, "loss": 0.0043, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1001 }, { "completion_length": 176.75, "epoch": 0.2505, "grad_norm": 0.536847710609436, "kl": 0.0531662292778492, "learning_rate": 4.6628785514242615e-06, "loss": 0.0021, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 1002 }, { "completion_length": 103.5, "epoch": 0.25075, "grad_norm": 0.034843314439058304, "kl": 0.048501890152692795, "learning_rate": 4.6617836013003885e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1003 }, { "completion_length": 198.25, "epoch": 0.251, "grad_norm": 0.6616882085800171, "kl": 0.07180707901716232, "learning_rate": 4.6606870048842626e-06, "loss": 0.0029, "reward": 1.2625000476837158, "reward_std": 0.8745407462120056, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.625, "step": 1004 }, { "completion_length": 110.0, "epoch": 0.25125, "grad_norm": 0.9252467155456543, "kl": 0.08372751623392105, "learning_rate": 4.65958876301099e-06, "loss": 0.0033, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1005 }, { "completion_length": 162.875, "epoch": 0.2515, "grad_norm": 0.48977896571159363, "kl": 0.05411114916205406, "learning_rate": 4.658488876516929e-06, "loss": 0.0022, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 1006 }, { "completion_length": 64.125, "epoch": 0.25175, "grad_norm": 1.0273231267929077, "kl": 0.083249032497406, "learning_rate": 4.6573873462396935e-06, "loss": 0.0033, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1007 }, { "completion_length": 165.25, "epoch": 0.252, "grad_norm": 0.5020899176597595, "kl": 0.05609561502933502, "learning_rate": 4.656284173018144e-06, "loss": 0.0022, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1008 }, { "completion_length": 142.5, "epoch": 0.25225, "grad_norm": 0.7043915390968323, "kl": 0.05257268622517586, "learning_rate": 4.655179357692396e-06, "loss": 0.0021, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1009 }, { "completion_length": 174.0, "epoch": 0.2525, "grad_norm": 0.705171525478363, "kl": 0.06622593849897385, "learning_rate": 4.654072901103815e-06, "loss": 0.0026, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1010 }, { "completion_length": 83.75, "epoch": 0.25275, "grad_norm": 0.04914606735110283, "kl": 0.05078301206231117, "learning_rate": 4.652964804095015e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1011 }, { "completion_length": 101.375, "epoch": 0.253, "grad_norm": 1.0606887340545654, "kl": 0.04142900928854942, "learning_rate": 4.65185506750986e-06, "loss": 0.0017, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1012 }, { "completion_length": 128.0, "epoch": 0.25325, "grad_norm": 0.742917001247406, "kl": 0.062295470386743546, "learning_rate": 4.650743692193462e-06, "loss": 0.0025, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.75, "step": 1013 }, { "completion_length": 113.0, "epoch": 0.2535, "grad_norm": 0.7006327509880066, "kl": 0.04041333496570587, "learning_rate": 4.649630678992184e-06, "loss": 0.0016, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1014 }, { "completion_length": 174.625, "epoch": 0.25375, "grad_norm": 0.7579224109649658, "kl": 0.07045772671699524, "learning_rate": 4.648516028753632e-06, "loss": 0.0028, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 1015 }, { "completion_length": 117.375, "epoch": 0.254, "grad_norm": 0.7735872864723206, "kl": 0.051190085709095, "learning_rate": 4.6473997423266615e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1016 }, { "completion_length": 163.375, "epoch": 0.25425, "grad_norm": 0.7195467352867126, "kl": 0.06525428593158722, "learning_rate": 4.646281820561372e-06, "loss": 0.0026, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1017 }, { "completion_length": 147.125, "epoch": 0.2545, "grad_norm": 0.021355951204895973, "kl": 0.04439732804894447, "learning_rate": 4.645162264309112e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1018 }, { "completion_length": 170.0, "epoch": 0.25475, "grad_norm": 0.6866489052772522, "kl": 0.056952353566884995, "learning_rate": 4.644041074422469e-06, "loss": 0.0023, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1019 }, { "completion_length": 166.125, "epoch": 0.255, "grad_norm": 0.6103510856628418, "kl": 0.05860072746872902, "learning_rate": 4.642918251755281e-06, "loss": 0.0023, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1020 }, { "completion_length": 161.5, "epoch": 0.25525, "grad_norm": 0.6660823822021484, "kl": 0.06608124077320099, "learning_rate": 4.641793797162625e-06, "loss": 0.0026, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1021 }, { "completion_length": 92.0, "epoch": 0.2555, "grad_norm": 0.07947294414043427, "kl": 0.09260207414627075, "learning_rate": 4.640667711500821e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1022 }, { "completion_length": 143.75, "epoch": 0.25575, "grad_norm": 0.023698054254055023, "kl": 0.04806230962276459, "learning_rate": 4.6395399956274334e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1023 }, { "completion_length": 215.5, "epoch": 0.256, "grad_norm": 0.5254735350608826, "kl": 0.037495993077754974, "learning_rate": 4.638410650401267e-06, "loss": 0.0015, "reward": 1.193750023841858, "reward_std": 0.8304204940795898, "rewards/_accuracy_reward": 0.4437499940395355, "rewards/_format_reward": 0.75, "step": 1024 }, { "completion_length": 138.0, "epoch": 0.25625, "grad_norm": 0.02838887646794319, "kl": 0.06760545819997787, "learning_rate": 4.637279676682367e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1025 }, { "completion_length": 126.375, "epoch": 0.2565, "grad_norm": 0.024629445746541023, "kl": 0.06675737351179123, "learning_rate": 4.636147075332019e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1026 }, { "completion_length": 143.0, "epoch": 0.25675, "grad_norm": 0.600119411945343, "kl": 0.06903208047151566, "learning_rate": 4.635012847212749e-06, "loss": 0.0028, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1027 }, { "completion_length": 134.625, "epoch": 0.257, "grad_norm": 0.7650855183601379, "kl": 0.059212468564510345, "learning_rate": 4.633876993188319e-06, "loss": 0.0024, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 1028 }, { "completion_length": 180.375, "epoch": 0.25725, "grad_norm": 0.5085853338241577, "kl": 0.04607116058468819, "learning_rate": 4.632739514123733e-06, "loss": 0.0018, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.875, "step": 1029 }, { "completion_length": 154.25, "epoch": 0.2575, "grad_norm": 0.6598264575004578, "kl": 0.05613408610224724, "learning_rate": 4.631600410885231e-06, "loss": 0.0022, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 1030 }, { "completion_length": 135.375, "epoch": 0.25775, "grad_norm": 0.020788883790373802, "kl": 0.06492722034454346, "learning_rate": 4.6304596843402885e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1031 }, { "completion_length": 165.375, "epoch": 0.258, "grad_norm": 0.6187155842781067, "kl": 0.045201320201158524, "learning_rate": 4.62931733535762e-06, "loss": 0.0018, "reward": 1.53125, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.53125, "rewards/_format_reward": 1.0, "step": 1032 }, { "completion_length": 195.125, "epoch": 0.25825, "grad_norm": 0.5255759358406067, "kl": 0.05828214809298515, "learning_rate": 4.628173364807171e-06, "loss": 0.0023, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1033 }, { "completion_length": 150.0, "epoch": 0.2585, "grad_norm": 0.6497631072998047, "kl": 0.07949826866388321, "learning_rate": 4.627027773560129e-06, "loss": 0.0032, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1034 }, { "completion_length": 155.25, "epoch": 0.25875, "grad_norm": 0.8581832051277161, "kl": 0.0458100289106369, "learning_rate": 4.625880562488908e-06, "loss": 0.0018, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1035 }, { "completion_length": 181.75, "epoch": 0.259, "grad_norm": 0.6497436761856079, "kl": 0.04667607694864273, "learning_rate": 4.62473173246716e-06, "loss": 0.0019, "reward": 1.0625, "reward_std": 0.5403372049331665, "rewards/_accuracy_reward": 0.1875, "rewards/_format_reward": 0.875, "step": 1036 }, { "completion_length": 161.125, "epoch": 0.25925, "grad_norm": 0.6015608906745911, "kl": 0.047349605709314346, "learning_rate": 4.62358128436977e-06, "loss": 0.0019, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1037 }, { "completion_length": 183.75, "epoch": 0.2595, "grad_norm": 0.6029166579246521, "kl": 0.04961675405502319, "learning_rate": 4.622429219072854e-06, "loss": 0.002, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 1038 }, { "completion_length": 176.875, "epoch": 0.25975, "grad_norm": 0.5468695759773254, "kl": 0.049566950649023056, "learning_rate": 4.6212755374537596e-06, "loss": 0.002, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1039 }, { "completion_length": 90.125, "epoch": 0.26, "grad_norm": 0.7798323035240173, "kl": 0.0876527652144432, "learning_rate": 4.620120240391065e-06, "loss": 0.0035, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1040 }, { "completion_length": 199.375, "epoch": 0.26025, "grad_norm": 0.5108053088188171, "kl": 0.056546930223703384, "learning_rate": 4.61896332876458e-06, "loss": 0.0023, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1041 }, { "completion_length": 159.125, "epoch": 0.2605, "grad_norm": 0.6133120059967041, "kl": 0.06025753542780876, "learning_rate": 4.6178048034553435e-06, "loss": 0.0024, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1042 }, { "completion_length": 156.875, "epoch": 0.26075, "grad_norm": 0.6843867897987366, "kl": 0.04468563199043274, "learning_rate": 4.616644665345621e-06, "loss": 0.0018, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1043 }, { "completion_length": 193.875, "epoch": 0.261, "grad_norm": 0.034902796149253845, "kl": 0.05293627455830574, "learning_rate": 4.6154829153189105e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1044 }, { "completion_length": 221.25, "epoch": 0.26125, "grad_norm": 0.5196229219436646, "kl": 0.044894713908433914, "learning_rate": 4.614319554259934e-06, "loss": 0.0018, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1045 }, { "completion_length": 107.5, "epoch": 0.2615, "grad_norm": 0.021327383816242218, "kl": 0.04686171934008598, "learning_rate": 4.613154583054641e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1046 }, { "completion_length": 169.375, "epoch": 0.26175, "grad_norm": 0.029194172471761703, "kl": 0.06922190636396408, "learning_rate": 4.611988002590209e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1047 }, { "completion_length": 186.75, "epoch": 0.262, "grad_norm": 0.5589852333068848, "kl": 0.05440639704465866, "learning_rate": 4.610819813755038e-06, "loss": 0.0022, "reward": 0.90625, "reward_std": 0.6483150720596313, "rewards/_accuracy_reward": 0.15625, "rewards/_format_reward": 0.75, "step": 1048 }, { "completion_length": 181.625, "epoch": 0.26225, "grad_norm": 0.7409690022468567, "kl": 0.0589996799826622, "learning_rate": 4.609650017438757e-06, "loss": 0.0024, "reward": 1.3762500286102295, "reward_std": 0.915547251701355, "rewards/_accuracy_reward": 0.6262500286102295, "rewards/_format_reward": 0.75, "step": 1049 }, { "completion_length": 197.375, "epoch": 0.2625, "grad_norm": 0.7713769674301147, "kl": 0.05258062854409218, "learning_rate": 4.608478614532215e-06, "loss": 0.0021, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1050 }, { "completion_length": 100.0, "epoch": 0.26275, "grad_norm": 0.7785927653312683, "kl": 0.04665987938642502, "learning_rate": 4.6073056059274865e-06, "loss": 0.0019, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1051 }, { "completion_length": 200.125, "epoch": 0.263, "grad_norm": 0.6331179738044739, "kl": 0.052123308181762695, "learning_rate": 4.60613099251787e-06, "loss": 0.0021, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1052 }, { "completion_length": 109.75, "epoch": 0.26325, "grad_norm": 0.8628464937210083, "kl": 0.05367436632514, "learning_rate": 4.604954775197882e-06, "loss": 0.0021, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7574999928474426, "rewards/_format_reward": 1.0, "step": 1053 }, { "completion_length": 184.75, "epoch": 0.2635, "grad_norm": 0.5951057076454163, "kl": 0.050312891602516174, "learning_rate": 4.603776954863266e-06, "loss": 0.002, "reward": 1.1875, "reward_std": 0.9519716501235962, "rewards/_accuracy_reward": 0.5625, "rewards/_format_reward": 0.625, "step": 1054 }, { "completion_length": 157.5, "epoch": 0.26375, "grad_norm": 0.8466554880142212, "kl": 0.04847400635480881, "learning_rate": 4.602597532410982e-06, "loss": 0.0019, "reward": 1.5499999523162842, "reward_std": 0.4855042099952698, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 1055 }, { "completion_length": 175.125, "epoch": 0.264, "grad_norm": 0.5892555713653564, "kl": 0.05255506560206413, "learning_rate": 4.601416508739211e-06, "loss": 0.0021, "reward": 1.6887500286102295, "reward_std": 0.43590423464775085, "rewards/_accuracy_reward": 0.6887500286102295, "rewards/_format_reward": 1.0, "step": 1056 }, { "completion_length": 133.125, "epoch": 0.26425, "grad_norm": 0.026735153049230576, "kl": 0.04194887727499008, "learning_rate": 4.600233884747355e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1057 }, { "completion_length": 113.5, "epoch": 0.2645, "grad_norm": 0.5905328989028931, "kl": 0.0982663631439209, "learning_rate": 4.599049661336033e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1058 }, { "completion_length": 120.75, "epoch": 0.26475, "grad_norm": 1.0292084217071533, "kl": 0.0494346097111702, "learning_rate": 4.5978638394070835e-06, "loss": 0.002, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 1059 }, { "completion_length": 183.25, "epoch": 0.265, "grad_norm": 0.6092599630355835, "kl": 0.060696642845869064, "learning_rate": 4.596676419863561e-06, "loss": 0.0024, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1060 }, { "completion_length": 155.75, "epoch": 0.26525, "grad_norm": 0.7234264612197876, "kl": 0.05358058586716652, "learning_rate": 4.595487403609736e-06, "loss": 0.0021, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1061 }, { "completion_length": 180.75, "epoch": 0.2655, "grad_norm": 0.5621580481529236, "kl": 0.06479740887880325, "learning_rate": 4.5942967915510975e-06, "loss": 0.0026, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1062 }, { "completion_length": 107.5, "epoch": 0.26575, "grad_norm": 0.46496903896331787, "kl": 0.037450678646564484, "learning_rate": 4.593104584594348e-06, "loss": 0.0015, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1063 }, { "completion_length": 114.25, "epoch": 0.266, "grad_norm": 0.7852677702903748, "kl": 0.0679827630519867, "learning_rate": 4.591910783647405e-06, "loss": 0.0027, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1064 }, { "completion_length": 181.75, "epoch": 0.26625, "grad_norm": 0.5932605862617493, "kl": 0.057104308158159256, "learning_rate": 4.590715389619399e-06, "loss": 0.0023, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1065 }, { "completion_length": 123.75, "epoch": 0.2665, "grad_norm": 0.6589471697807312, "kl": 0.04012312740087509, "learning_rate": 4.589518403420676e-06, "loss": 0.0016, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1066 }, { "completion_length": 68.375, "epoch": 0.26675, "grad_norm": 0.8799357414245605, "kl": 0.0480550192296505, "learning_rate": 4.588319825962793e-06, "loss": 0.0019, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1067 }, { "completion_length": 146.625, "epoch": 0.267, "grad_norm": 0.6732004284858704, "kl": 0.045358166098594666, "learning_rate": 4.587119658158517e-06, "loss": 0.0018, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.875, "step": 1068 }, { "completion_length": 183.875, "epoch": 0.26725, "grad_norm": 0.040793102234601974, "kl": 0.07178792357444763, "learning_rate": 4.58591790092183e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1069 }, { "completion_length": 132.5, "epoch": 0.2675, "grad_norm": 0.04739204794168472, "kl": 0.06881429255008698, "learning_rate": 4.584714555167921e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1070 }, { "completion_length": 156.75, "epoch": 0.26775, "grad_norm": 0.8715057969093323, "kl": 0.08568653464317322, "learning_rate": 4.583509621813192e-06, "loss": 0.0034, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1071 }, { "completion_length": 141.0, "epoch": 0.268, "grad_norm": 0.0545993335545063, "kl": 0.07613251358270645, "learning_rate": 4.582303101775249e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1072 }, { "completion_length": 154.25, "epoch": 0.26825, "grad_norm": 0.7691239714622498, "kl": 0.07986550033092499, "learning_rate": 4.581094995972912e-06, "loss": 0.0032, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1073 }, { "completion_length": 166.75, "epoch": 0.2685, "grad_norm": 0.5310074687004089, "kl": 0.05323049798607826, "learning_rate": 4.579885305326206e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1074 }, { "completion_length": 172.625, "epoch": 0.26875, "grad_norm": 0.7366254329681396, "kl": 0.060853827744722366, "learning_rate": 4.578674030756364e-06, "loss": 0.0024, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1075 }, { "completion_length": 157.625, "epoch": 0.269, "grad_norm": 0.5994381904602051, "kl": 0.05490027368068695, "learning_rate": 4.577461173185821e-06, "loss": 0.0022, "reward": 1.0749999284744263, "reward_std": 0.0707106813788414, "rewards/_accuracy_reward": 0.07500000298023224, "rewards/_format_reward": 1.0, "step": 1076 }, { "completion_length": 160.375, "epoch": 0.26925, "grad_norm": 0.023495573550462723, "kl": 0.04241487383842468, "learning_rate": 4.576246733538223e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1077 }, { "completion_length": 182.625, "epoch": 0.2695, "grad_norm": 0.6254801750183105, "kl": 0.055535938590765, "learning_rate": 4.5750307127384194e-06, "loss": 0.0022, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 1078 }, { "completion_length": 116.125, "epoch": 0.26975, "grad_norm": 0.04201361909508705, "kl": 0.07124117761850357, "learning_rate": 4.5738131117124605e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1079 }, { "completion_length": 122.875, "epoch": 0.27, "grad_norm": 0.8643020391464233, "kl": 0.06395326554775238, "learning_rate": 4.572593931387604e-06, "loss": 0.0026, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 1080 }, { "completion_length": 146.375, "epoch": 0.27025, "grad_norm": 0.019885435700416565, "kl": 0.038068097084760666, "learning_rate": 4.571373172692309e-06, "loss": 0.0015, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1081 }, { "completion_length": 123.625, "epoch": 0.2705, "grad_norm": 1.255374550819397, "kl": 0.05284246429800987, "learning_rate": 4.570150836556236e-06, "loss": 0.0021, "reward": 1.1937499046325684, "reward_std": 0.33320683240890503, "rewards/_accuracy_reward": 0.19375000894069672, "rewards/_format_reward": 1.0, "step": 1082 }, { "completion_length": 162.5, "epoch": 0.27075, "grad_norm": 0.6360936760902405, "kl": 0.08327899128198624, "learning_rate": 4.568926923910248e-06, "loss": 0.0033, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1083 }, { "completion_length": 152.75, "epoch": 0.271, "grad_norm": 0.7469035387039185, "kl": 0.048695940524339676, "learning_rate": 4.567701435686405e-06, "loss": 0.0019, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1084 }, { "completion_length": 153.625, "epoch": 0.27125, "grad_norm": 0.05639031156897545, "kl": 0.06517668068408966, "learning_rate": 4.566474372817971e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1085 }, { "completion_length": 117.75, "epoch": 0.2715, "grad_norm": 0.9581111669540405, "kl": 0.05397922918200493, "learning_rate": 4.5652457362394094e-06, "loss": 0.0022, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1086 }, { "completion_length": 147.625, "epoch": 0.27175, "grad_norm": 0.7434791922569275, "kl": 0.07150553166866302, "learning_rate": 4.56401552688638e-06, "loss": 0.0029, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 1.0, "step": 1087 }, { "completion_length": 138.25, "epoch": 0.272, "grad_norm": 0.8051536083221436, "kl": 0.05290424823760986, "learning_rate": 4.562783745695738e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1088 }, { "completion_length": 130.5, "epoch": 0.27225, "grad_norm": 0.05256952345371246, "kl": 0.07315313816070557, "learning_rate": 4.561550393605541e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1089 }, { "completion_length": 154.75, "epoch": 0.2725, "grad_norm": 0.03965034335851669, "kl": 0.07297085970640182, "learning_rate": 4.560315471555039e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1090 }, { "completion_length": 151.375, "epoch": 0.27275, "grad_norm": 0.6053784489631653, "kl": 0.055059581995010376, "learning_rate": 4.55907898048468e-06, "loss": 0.0022, "reward": 1.431249976158142, "reward_std": 0.47579824924468994, "rewards/_accuracy_reward": 0.4312499761581421, "rewards/_format_reward": 1.0, "step": 1091 }, { "completion_length": 139.125, "epoch": 0.273, "grad_norm": 0.8535023331642151, "kl": 0.037554092705249786, "learning_rate": 4.5578409213361055e-06, "loss": 0.0015, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1092 }, { "completion_length": 159.375, "epoch": 0.27325, "grad_norm": 0.03497467190027237, "kl": 0.0712779238820076, "learning_rate": 4.55660129505215e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1093 }, { "completion_length": 112.375, "epoch": 0.2735, "grad_norm": 0.7117050290107727, "kl": 0.05563116446137428, "learning_rate": 4.555360102576844e-06, "loss": 0.0022, "reward": 1.431249976158142, "reward_std": 0.47579821944236755, "rewards/_accuracy_reward": 0.4312499761581421, "rewards/_format_reward": 1.0, "step": 1094 }, { "completion_length": 167.375, "epoch": 0.27375, "grad_norm": 0.5020763874053955, "kl": 0.050325650721788406, "learning_rate": 4.55411734485541e-06, "loss": 0.002, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1095 }, { "completion_length": 133.125, "epoch": 0.274, "grad_norm": 0.718513011932373, "kl": 0.05303411930799484, "learning_rate": 4.55287302283426e-06, "loss": 0.0021, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1096 }, { "completion_length": 189.375, "epoch": 0.27425, "grad_norm": 0.608174741268158, "kl": 0.05475342273712158, "learning_rate": 4.551627137461002e-06, "loss": 0.0022, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1097 }, { "completion_length": 158.25, "epoch": 0.2745, "grad_norm": 0.6787841320037842, "kl": 0.05620495602488518, "learning_rate": 4.550379689684431e-06, "loss": 0.0022, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1098 }, { "completion_length": 146.375, "epoch": 0.27475, "grad_norm": 0.5872085690498352, "kl": 0.041002292186021805, "learning_rate": 4.549130680454532e-06, "loss": 0.0016, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1099 }, { "completion_length": 161.5, "epoch": 0.275, "grad_norm": 0.588703453540802, "kl": 0.055859677493572235, "learning_rate": 4.54788011072248e-06, "loss": 0.0022, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.668749988079071, "rewards/_format_reward": 1.0, "step": 1100 }, { "completion_length": 170.5, "epoch": 0.27525, "grad_norm": 0.5734580159187317, "kl": 0.06272434443235397, "learning_rate": 4.546627981440639e-06, "loss": 0.0025, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1101 }, { "completion_length": 160.875, "epoch": 0.2755, "grad_norm": 0.6104035973548889, "kl": 0.0559980645775795, "learning_rate": 4.545374293562559e-06, "loss": 0.0022, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.875, "step": 1102 }, { "completion_length": 177.125, "epoch": 0.27575, "grad_norm": 0.5717198252677917, "kl": 0.09279928356409073, "learning_rate": 4.544119048042978e-06, "loss": 0.0037, "reward": 1.6875, "reward_std": 0.4381372928619385, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 0.875, "step": 1103 }, { "completion_length": 196.625, "epoch": 0.276, "grad_norm": 0.5511927604675293, "kl": 0.03467211127281189, "learning_rate": 4.542862245837821e-06, "loss": 0.0014, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 1104 }, { "completion_length": 155.625, "epoch": 0.27625, "grad_norm": 0.6550043821334839, "kl": 0.04956316575407982, "learning_rate": 4.541603887904198e-06, "loss": 0.002, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1105 }, { "completion_length": 163.875, "epoch": 0.2765, "grad_norm": 0.4754960834980011, "kl": 0.046637773513793945, "learning_rate": 4.540343975200401e-06, "loss": 0.0019, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1106 }, { "completion_length": 172.25, "epoch": 0.27675, "grad_norm": 0.031275276094675064, "kl": 0.04974092170596123, "learning_rate": 4.5390825086859094e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1107 }, { "completion_length": 149.75, "epoch": 0.277, "grad_norm": 0.5498752593994141, "kl": 0.04549973085522652, "learning_rate": 4.537819489321385e-06, "loss": 0.0018, "reward": 1.4200000762939453, "reward_std": 0.7127813100814819, "rewards/_accuracy_reward": 0.5449999570846558, "rewards/_format_reward": 0.875, "step": 1108 }, { "completion_length": 152.5, "epoch": 0.27725, "grad_norm": 0.6540334224700928, "kl": 0.03749295696616173, "learning_rate": 4.536554918068673e-06, "loss": 0.0015, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1109 }, { "completion_length": 155.125, "epoch": 0.2775, "grad_norm": 0.6249139904975891, "kl": 0.052198849618434906, "learning_rate": 4.535288795890799e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1110 }, { "completion_length": 162.75, "epoch": 0.27775, "grad_norm": 0.5957991480827332, "kl": 0.0481363981962204, "learning_rate": 4.5340211237519685e-06, "loss": 0.0019, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1111 }, { "completion_length": 84.5, "epoch": 0.278, "grad_norm": 0.970755398273468, "kl": 0.029281822964549065, "learning_rate": 4.5327519026175694e-06, "loss": 0.0012, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7575000524520874, "rewards/_format_reward": 1.0, "step": 1112 }, { "completion_length": 160.5, "epoch": 0.27825, "grad_norm": 0.5828734040260315, "kl": 0.05844755843281746, "learning_rate": 4.5314811334541695e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1113 }, { "completion_length": 180.625, "epoch": 0.2785, "grad_norm": 0.502030611038208, "kl": 0.05643211305141449, "learning_rate": 4.530208817229516e-06, "loss": 0.0023, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1114 }, { "completion_length": 120.375, "epoch": 0.27875, "grad_norm": 0.7388814091682434, "kl": 0.038211237639188766, "learning_rate": 4.528934954912531e-06, "loss": 0.0015, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 1115 }, { "completion_length": 203.375, "epoch": 0.279, "grad_norm": 0.4916674494743347, "kl": 0.050826288759708405, "learning_rate": 4.527659547473317e-06, "loss": 0.002, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1116 }, { "completion_length": 111.625, "epoch": 0.27925, "grad_norm": 0.6236878037452698, "kl": 0.06385045498609543, "learning_rate": 4.526382595883152e-06, "loss": 0.0026, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1117 }, { "completion_length": 169.125, "epoch": 0.2795, "grad_norm": 0.5840001702308655, "kl": 0.05632218345999718, "learning_rate": 4.5251041011144905e-06, "loss": 0.0023, "reward": 1.6262500286102295, "reward_std": 0.7428312301635742, "rewards/_accuracy_reward": 0.7512500286102295, "rewards/_format_reward": 0.875, "step": 1118 }, { "completion_length": 130.0, "epoch": 0.27975, "grad_norm": 0.8513467311859131, "kl": 0.08324826508760452, "learning_rate": 4.523824064140961e-06, "loss": 0.0033, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1119 }, { "completion_length": 149.75, "epoch": 0.28, "grad_norm": 0.480070024728775, "kl": 0.039729684591293335, "learning_rate": 4.522542485937369e-06, "loss": 0.0016, "reward": 1.5012500286102295, "reward_std": 0.923509418964386, "rewards/_accuracy_reward": 0.7512500286102295, "rewards/_format_reward": 0.75, "step": 1120 }, { "completion_length": 187.0, "epoch": 0.28025, "grad_norm": 0.5705462694168091, "kl": 0.05749392881989479, "learning_rate": 4.521259367479691e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1121 }, { "completion_length": 92.5, "epoch": 0.2805, "grad_norm": 0.5976485013961792, "kl": 0.06839491426944733, "learning_rate": 4.519974709745076e-06, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1122 }, { "completion_length": 145.75, "epoch": 0.28075, "grad_norm": 0.46838706731796265, "kl": 0.0469730868935585, "learning_rate": 4.51868851371185e-06, "loss": 0.0019, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 1123 }, { "completion_length": 138.625, "epoch": 0.281, "grad_norm": 0.02328825183212757, "kl": 0.045329801738262177, "learning_rate": 4.517400780359505e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1124 }, { "completion_length": 123.0, "epoch": 0.28125, "grad_norm": 0.023147309198975563, "kl": 0.05815961956977844, "learning_rate": 4.516111510668707e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1125 }, { "completion_length": 176.125, "epoch": 0.2815, "grad_norm": 0.5607932806015015, "kl": 0.052996277809143066, "learning_rate": 4.51482070562129e-06, "loss": 0.0021, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1126 }, { "completion_length": 89.625, "epoch": 0.28175, "grad_norm": 0.023232001811265945, "kl": 0.04320669546723366, "learning_rate": 4.513528366200258e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1127 }, { "completion_length": 160.875, "epoch": 0.282, "grad_norm": 0.5776987671852112, "kl": 0.05144071206450462, "learning_rate": 4.512234493389785e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1128 }, { "completion_length": 172.75, "epoch": 0.28225, "grad_norm": 0.5175846219062805, "kl": 0.05581043288111687, "learning_rate": 4.510939088175211e-06, "loss": 0.0022, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 1129 }, { "completion_length": 145.375, "epoch": 0.2825, "grad_norm": 0.6114826202392578, "kl": 0.05112025886774063, "learning_rate": 4.509642151543043e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1130 }, { "completion_length": 102.75, "epoch": 0.28275, "grad_norm": 0.6514590978622437, "kl": 0.04705269634723663, "learning_rate": 4.508343684480956e-06, "loss": 0.0019, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1131 }, { "completion_length": 158.75, "epoch": 0.283, "grad_norm": 0.608859658241272, "kl": 0.1234949603676796, "learning_rate": 4.507043687977787e-06, "loss": 0.0049, "reward": 1.443750023841858, "reward_std": 0.7022603750228882, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 0.75, "step": 1132 }, { "completion_length": 149.875, "epoch": 0.28325, "grad_norm": 0.8146029710769653, "kl": 0.056399088352918625, "learning_rate": 4.505742163023541e-06, "loss": 0.0023, "reward": 1.0437499284744263, "reward_std": 0.5212878584861755, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 0.875, "step": 1133 }, { "completion_length": 207.125, "epoch": 0.2835, "grad_norm": 0.5459649562835693, "kl": 0.06084807217121124, "learning_rate": 4.504439110609385e-06, "loss": 0.0024, "reward": 1.28125, "reward_std": 0.8807210922241211, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.625, "step": 1134 }, { "completion_length": 151.375, "epoch": 0.28375, "grad_norm": 0.7785174250602722, "kl": 0.05954066291451454, "learning_rate": 4.503134531727652e-06, "loss": 0.0024, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1135 }, { "completion_length": 152.0, "epoch": 0.284, "grad_norm": 0.05124456435441971, "kl": 0.04783984273672104, "learning_rate": 4.501828427371834e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1136 }, { "completion_length": 168.875, "epoch": 0.28425, "grad_norm": 0.027116047218441963, "kl": 0.0674908459186554, "learning_rate": 4.5005207985365875e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1137 }, { "completion_length": 156.75, "epoch": 0.2845, "grad_norm": 0.7124760150909424, "kl": 0.06690853834152222, "learning_rate": 4.4992116462177274e-06, "loss": 0.0027, "reward": 1.756250023841858, "reward_std": 0.45153507590293884, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 1138 }, { "completion_length": 181.75, "epoch": 0.28475, "grad_norm": 0.5964755415916443, "kl": 0.06595727056264877, "learning_rate": 4.49790097141223e-06, "loss": 0.0026, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1139 }, { "completion_length": 156.25, "epoch": 0.285, "grad_norm": 0.8890787959098816, "kl": 0.07360620051622391, "learning_rate": 4.496588775118232e-06, "loss": 0.0029, "reward": 1.631250023841858, "reward_std": 0.7382108569145203, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 1140 }, { "completion_length": 157.5, "epoch": 0.28525, "grad_norm": 0.8517801761627197, "kl": 0.0814465880393982, "learning_rate": 4.495275058335029e-06, "loss": 0.0033, "reward": 1.4187500476837158, "reward_std": 0.7235515117645264, "rewards/_accuracy_reward": 0.543749988079071, "rewards/_format_reward": 0.875, "step": 1141 }, { "completion_length": 171.125, "epoch": 0.2855, "grad_norm": 0.5479044914245605, "kl": 0.04513910040259361, "learning_rate": 4.4939598220630724e-06, "loss": 0.0018, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.875, "step": 1142 }, { "completion_length": 164.0, "epoch": 0.28575, "grad_norm": 0.05868508666753769, "kl": 0.06565727293491364, "learning_rate": 4.49264306730397e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1143 }, { "completion_length": 155.75, "epoch": 0.286, "grad_norm": 0.7048906087875366, "kl": 0.04985063150525093, "learning_rate": 4.491324795060491e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1144 }, { "completion_length": 129.625, "epoch": 0.28625, "grad_norm": 0.017044005915522575, "kl": 0.04838981479406357, "learning_rate": 4.490005006336555e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1145 }, { "completion_length": 131.375, "epoch": 0.2865, "grad_norm": 0.5756621360778809, "kl": 0.05202525854110718, "learning_rate": 4.48868370213724e-06, "loss": 0.0021, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1146 }, { "completion_length": 152.375, "epoch": 0.28675, "grad_norm": 1.1808115243911743, "kl": 0.06879065185785294, "learning_rate": 4.487360883468775e-06, "loss": 0.0028, "reward": 1.3125, "reward_std": 0.4299086630344391, "rewards/_accuracy_reward": 0.3124999701976776, "rewards/_format_reward": 1.0, "step": 1147 }, { "completion_length": 175.125, "epoch": 0.287, "grad_norm": 0.7814071774482727, "kl": 0.06425180286169052, "learning_rate": 4.4860365513385456e-06, "loss": 0.0026, "reward": 1.46875, "reward_std": 0.6999680995941162, "rewards/_accuracy_reward": 0.59375, "rewards/_format_reward": 0.875, "step": 1148 }, { "completion_length": 115.75, "epoch": 0.28725, "grad_norm": 0.8087677955627441, "kl": 0.04324078559875488, "learning_rate": 4.484710706755087e-06, "loss": 0.0017, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1149 }, { "completion_length": 144.375, "epoch": 0.2875, "grad_norm": 0.9671500325202942, "kl": 0.09105661511421204, "learning_rate": 4.4833833507280884e-06, "loss": 0.0036, "reward": 1.5149999856948853, "reward_std": 0.5187072157859802, "rewards/_accuracy_reward": 0.5149999856948853, "rewards/_format_reward": 1.0, "step": 1150 }, { "completion_length": 156.375, "epoch": 0.28775, "grad_norm": 0.7499131560325623, "kl": 0.07455121725797653, "learning_rate": 4.482054484268389e-06, "loss": 0.003, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 1151 }, { "completion_length": 144.75, "epoch": 0.288, "grad_norm": 0.08447606861591339, "kl": 0.10362078249454498, "learning_rate": 4.4807241083879774e-06, "loss": 0.0041, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1152 }, { "completion_length": 96.125, "epoch": 0.28825, "grad_norm": 0.8950498700141907, "kl": 0.10107363015413284, "learning_rate": 4.4793922240999935e-06, "loss": 0.004, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1153 }, { "completion_length": 122.125, "epoch": 0.2885, "grad_norm": 1.3117358684539795, "kl": 0.09728724509477615, "learning_rate": 4.478058832418726e-06, "loss": 0.0039, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 1154 }, { "completion_length": 167.0, "epoch": 0.28875, "grad_norm": 0.6316139101982117, "kl": 0.08301078528165817, "learning_rate": 4.476723934359609e-06, "loss": 0.0033, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.643750011920929, "rewards/_format_reward": 1.0, "step": 1155 }, { "completion_length": 130.875, "epoch": 0.289, "grad_norm": 0.7130420804023743, "kl": 0.08175483345985413, "learning_rate": 4.475387530939226e-06, "loss": 0.0033, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1156 }, { "completion_length": 135.5, "epoch": 0.28925, "grad_norm": 0.06840886920690536, "kl": 0.07843038439750671, "learning_rate": 4.474049623175307e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1157 }, { "completion_length": 105.875, "epoch": 0.2895, "grad_norm": 0.633524477481842, "kl": 0.07753744721412659, "learning_rate": 4.4727102120867274e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1158 }, { "completion_length": 168.5, "epoch": 0.28975, "grad_norm": 0.808603048324585, "kl": 0.08730296045541763, "learning_rate": 4.471369298693505e-06, "loss": 0.0035, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1159 }, { "completion_length": 131.75, "epoch": 0.29, "grad_norm": 0.039577484130859375, "kl": 0.06506015360355377, "learning_rate": 4.470026884016805e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1160 }, { "completion_length": 107.25, "epoch": 0.29025, "grad_norm": 0.7665241360664368, "kl": 0.09168189764022827, "learning_rate": 4.468682969078935e-06, "loss": 0.0037, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1161 }, { "completion_length": 148.875, "epoch": 0.2905, "grad_norm": 0.6652460694313049, "kl": 0.05769990384578705, "learning_rate": 4.467337554903344e-06, "loss": 0.0023, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1162 }, { "completion_length": 180.5, "epoch": 0.29075, "grad_norm": 0.6770204305648804, "kl": 0.0662575513124466, "learning_rate": 4.465990642514622e-06, "loss": 0.0027, "reward": 1.537500023841858, "reward_std": 0.7322909235954285, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.875, "step": 1163 }, { "completion_length": 110.0, "epoch": 0.291, "grad_norm": 0.037855084985494614, "kl": 0.06914416700601578, "learning_rate": 4.464642232938505e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1164 }, { "completion_length": 155.125, "epoch": 0.29125, "grad_norm": 0.48786526918411255, "kl": 0.05076320096850395, "learning_rate": 4.463292327201862e-06, "loss": 0.002, "reward": 1.1837499141693115, "reward_std": 0.3386079967021942, "rewards/_accuracy_reward": 0.1837499886751175, "rewards/_format_reward": 1.0, "step": 1165 }, { "completion_length": 75.375, "epoch": 0.2915, "grad_norm": 0.994990348815918, "kl": 0.09153227508068085, "learning_rate": 4.461940926332708e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1166 }, { "completion_length": 109.625, "epoch": 0.29175, "grad_norm": 0.021669652312994003, "kl": 0.04759254679083824, "learning_rate": 4.460588031360191e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1167 }, { "completion_length": 93.875, "epoch": 0.292, "grad_norm": 0.8044827580451965, "kl": 0.03482041880488396, "learning_rate": 4.4592336433146e-06, "loss": 0.0014, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1168 }, { "completion_length": 166.125, "epoch": 0.29225, "grad_norm": 0.6489118933677673, "kl": 0.06921645253896713, "learning_rate": 4.457877763227361e-06, "loss": 0.0028, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 1.0, "step": 1169 }, { "completion_length": 168.75, "epoch": 0.2925, "grad_norm": 0.5671146512031555, "kl": 0.06925438344478607, "learning_rate": 4.456520392131035e-06, "loss": 0.0028, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.518750011920929, "rewards/_format_reward": 0.875, "step": 1170 }, { "completion_length": 176.5, "epoch": 0.29275, "grad_norm": 0.7186687588691711, "kl": 0.08284782618284225, "learning_rate": 4.45516153105932e-06, "loss": 0.0033, "reward": 1.28125, "reward_std": 0.44395747780799866, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 0.875, "step": 1171 }, { "completion_length": 140.0, "epoch": 0.293, "grad_norm": 0.5549473762512207, "kl": 0.05967408046126366, "learning_rate": 4.453801181047047e-06, "loss": 0.0024, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1172 }, { "completion_length": 160.625, "epoch": 0.29325, "grad_norm": 0.593147337436676, "kl": 0.062421780079603195, "learning_rate": 4.452439343130183e-06, "loss": 0.0025, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1173 }, { "completion_length": 135.125, "epoch": 0.2935, "grad_norm": 0.03139398992061615, "kl": 0.05725998803973198, "learning_rate": 4.4510760183458246e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1174 }, { "completion_length": 136.875, "epoch": 0.29375, "grad_norm": 0.035139165818691254, "kl": 0.08669183403253555, "learning_rate": 4.4497112077322045e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1175 }, { "completion_length": 93.125, "epoch": 0.294, "grad_norm": 0.03356340900063515, "kl": 0.10462416708469391, "learning_rate": 4.448344912328686e-06, "loss": 0.0042, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1176 }, { "completion_length": 174.5, "epoch": 0.29425, "grad_norm": 0.6406842470169067, "kl": 0.10782821476459503, "learning_rate": 4.446977133175761e-06, "loss": 0.0043, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 1177 }, { "completion_length": 180.125, "epoch": 0.2945, "grad_norm": 0.6326098442077637, "kl": 0.0835682675242424, "learning_rate": 4.445607871315053e-06, "loss": 0.0033, "reward": 1.1437499523162842, "reward_std": 0.8317097425460815, "rewards/_accuracy_reward": 0.39374998211860657, "rewards/_format_reward": 0.75, "step": 1178 }, { "completion_length": 96.625, "epoch": 0.29475, "grad_norm": 0.7805556058883667, "kl": 0.03996328264474869, "learning_rate": 4.444237127789315e-06, "loss": 0.0016, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 1.0, "step": 1179 }, { "completion_length": 133.375, "epoch": 0.295, "grad_norm": 0.02025960385799408, "kl": 0.059164561331272125, "learning_rate": 4.442864903642428e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1180 }, { "completion_length": 147.875, "epoch": 0.29525, "grad_norm": 0.03799540922045708, "kl": 0.06712915748357773, "learning_rate": 4.4414911999194e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1181 }, { "completion_length": 164.25, "epoch": 0.2955, "grad_norm": 0.6490309238433838, "kl": 0.07048535346984863, "learning_rate": 4.440116017666365e-06, "loss": 0.0028, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1182 }, { "completion_length": 136.0, "epoch": 0.29575, "grad_norm": 0.6978442072868347, "kl": 0.06952231377363205, "learning_rate": 4.438739357930587e-06, "loss": 0.0028, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1183 }, { "completion_length": 98.125, "epoch": 0.296, "grad_norm": 1.5878691673278809, "kl": 0.04807111620903015, "learning_rate": 4.437361221760449e-06, "loss": 0.0019, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1184 }, { "completion_length": 166.0, "epoch": 0.29625, "grad_norm": 0.5427411198616028, "kl": 0.07635194063186646, "learning_rate": 4.435981610205464e-06, "loss": 0.0031, "reward": 0.9699999094009399, "reward_std": 0.3982820212841034, "rewards/_accuracy_reward": 0.0949999988079071, "rewards/_format_reward": 0.875, "step": 1185 }, { "completion_length": 140.625, "epoch": 0.2965, "grad_norm": 0.6156898140907288, "kl": 0.06197688356041908, "learning_rate": 4.434600524316266e-06, "loss": 0.0025, "reward": 1.5199999809265137, "reward_std": 0.5133086442947388, "rewards/_accuracy_reward": 0.5199999809265137, "rewards/_format_reward": 1.0, "step": 1186 }, { "completion_length": 144.0, "epoch": 0.29675, "grad_norm": 0.6569059491157532, "kl": 0.05045042932033539, "learning_rate": 4.4332179651446106e-06, "loss": 0.002, "reward": 1.6325000524520874, "reward_std": 0.507395327091217, "rewards/_accuracy_reward": 0.7574999928474426, "rewards/_format_reward": 0.875, "step": 1187 }, { "completion_length": 164.5, "epoch": 0.297, "grad_norm": 0.7192619442939758, "kl": 0.07004435360431671, "learning_rate": 4.431833933743378e-06, "loss": 0.0028, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1188 }, { "completion_length": 160.125, "epoch": 0.29725, "grad_norm": 0.7192215323448181, "kl": 0.05854785814881325, "learning_rate": 4.430448431166567e-06, "loss": 0.0023, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.40625, "rewards/_format_reward": 1.0, "step": 1189 }, { "completion_length": 148.375, "epoch": 0.2975, "grad_norm": 0.6885538101196289, "kl": 0.07502228021621704, "learning_rate": 4.4290614584693005e-06, "loss": 0.003, "reward": 1.5499999523162842, "reward_std": 0.4855041801929474, "rewards/_accuracy_reward": 0.5499999523162842, "rewards/_format_reward": 1.0, "step": 1190 }, { "completion_length": 93.125, "epoch": 0.29775, "grad_norm": 0.816205620765686, "kl": 0.08329864591360092, "learning_rate": 4.427673016707817e-06, "loss": 0.0033, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1191 }, { "completion_length": 160.25, "epoch": 0.298, "grad_norm": 0.7627013921737671, "kl": 0.0577714703977108, "learning_rate": 4.426283106939474e-06, "loss": 0.0023, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1192 }, { "completion_length": 135.75, "epoch": 0.29825, "grad_norm": 0.6774864196777344, "kl": 0.05635792762041092, "learning_rate": 4.424891730222749e-06, "loss": 0.0023, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1193 }, { "completion_length": 113.625, "epoch": 0.2985, "grad_norm": 0.7691601514816284, "kl": 0.0638766884803772, "learning_rate": 4.423498887617238e-06, "loss": 0.0026, "reward": 1.4375, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.4375, "rewards/_format_reward": 1.0, "step": 1194 }, { "completion_length": 115.0, "epoch": 0.29875, "grad_norm": 0.9962632060050964, "kl": 0.09802590310573578, "learning_rate": 4.422104580183649e-06, "loss": 0.0039, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7575000524520874, "rewards/_format_reward": 1.0, "step": 1195 }, { "completion_length": 132.125, "epoch": 0.299, "grad_norm": 0.8800735473632812, "kl": 0.05530092492699623, "learning_rate": 4.420708808983809e-06, "loss": 0.0022, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1196 }, { "completion_length": 129.0, "epoch": 0.29925, "grad_norm": 0.02906613051891327, "kl": 0.08473115414381027, "learning_rate": 4.419311575080657e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1197 }, { "completion_length": 194.375, "epoch": 0.2995, "grad_norm": 0.5856153964996338, "kl": 0.06350870430469513, "learning_rate": 4.41791287953825e-06, "loss": 0.0025, "reward": 1.431249976158142, "reward_std": 0.47579821944236755, "rewards/_accuracy_reward": 0.4312500059604645, "rewards/_format_reward": 1.0, "step": 1198 }, { "completion_length": 159.0, "epoch": 0.29975, "grad_norm": 0.03134987875819206, "kl": 0.06507380306720734, "learning_rate": 4.416512723421752e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1199 }, { "completion_length": 124.125, "epoch": 0.3, "grad_norm": 0.0544891431927681, "kl": 0.07167188823223114, "learning_rate": 4.415111107797445e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1200 }, { "completion_length": 140.25, "epoch": 0.30025, "grad_norm": 0.032767925411462784, "kl": 0.07854177057743073, "learning_rate": 4.413708033732721e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1201 }, { "completion_length": 97.0, "epoch": 0.3005, "grad_norm": 0.02442978322505951, "kl": 0.03377045691013336, "learning_rate": 4.412303502296081e-06, "loss": 0.0014, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1202 }, { "completion_length": 156.375, "epoch": 0.30075, "grad_norm": 0.8720740079879761, "kl": 0.06999517232179642, "learning_rate": 4.410897514557134e-06, "loss": 0.0028, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1203 }, { "completion_length": 161.125, "epoch": 0.301, "grad_norm": 0.6562350988388062, "kl": 0.05543859675526619, "learning_rate": 4.409490071586606e-06, "loss": 0.0022, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 1204 }, { "completion_length": 158.75, "epoch": 0.30125, "grad_norm": 0.7255245447158813, "kl": 0.06354228407144547, "learning_rate": 4.408081174456322e-06, "loss": 0.0025, "reward": 1.3512500524520874, "reward_std": 0.6214140057563782, "rewards/_accuracy_reward": 0.4762499928474426, "rewards/_format_reward": 0.875, "step": 1205 }, { "completion_length": 152.625, "epoch": 0.3015, "grad_norm": 0.9874303936958313, "kl": 0.0944613516330719, "learning_rate": 4.406670824239221e-06, "loss": 0.0038, "reward": 1.600000023841858, "reward_std": 0.43260011076927185, "rewards/_accuracy_reward": 0.6000000238418579, "rewards/_format_reward": 1.0, "step": 1206 }, { "completion_length": 159.625, "epoch": 0.30175, "grad_norm": 0.12287536263465881, "kl": 0.07532623410224915, "learning_rate": 4.405259022009345e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1207 }, { "completion_length": 164.625, "epoch": 0.302, "grad_norm": 0.6047512888908386, "kl": 0.042720258235931396, "learning_rate": 4.403845768841842e-06, "loss": 0.0017, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1208 }, { "completion_length": 147.375, "epoch": 0.30225, "grad_norm": 0.718567967414856, "kl": 0.06836355477571487, "learning_rate": 4.402431065812968e-06, "loss": 0.0027, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1209 }, { "completion_length": 133.125, "epoch": 0.3025, "grad_norm": 0.051449116319417953, "kl": 0.09786742180585861, "learning_rate": 4.401014914000078e-06, "loss": 0.0039, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1210 }, { "completion_length": 155.0, "epoch": 0.30275, "grad_norm": 0.6808714270591736, "kl": 0.06858290731906891, "learning_rate": 4.399597314481635e-06, "loss": 0.0027, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.875, "step": 1211 }, { "completion_length": 103.625, "epoch": 0.303, "grad_norm": 0.7550353407859802, "kl": 0.07095042616128922, "learning_rate": 4.398178268337202e-06, "loss": 0.0028, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1212 }, { "completion_length": 157.75, "epoch": 0.30325, "grad_norm": 0.03089936450123787, "kl": 0.07108917087316513, "learning_rate": 4.396757776647446e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1213 }, { "completion_length": 125.0, "epoch": 0.3035, "grad_norm": 0.6539469957351685, "kl": 0.05641159415245056, "learning_rate": 4.395335840494131e-06, "loss": 0.0023, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1214 }, { "completion_length": 131.5, "epoch": 0.30375, "grad_norm": 0.09180185198783875, "kl": 0.0818408653140068, "learning_rate": 4.393912460960125e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1215 }, { "completion_length": 117.0, "epoch": 0.304, "grad_norm": 1.0178251266479492, "kl": 0.14222905039787292, "learning_rate": 4.3924876391293915e-06, "loss": 0.0057, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1216 }, { "completion_length": 176.5, "epoch": 0.30425, "grad_norm": 0.5800394415855408, "kl": 0.07087651640176773, "learning_rate": 4.391061376086996e-06, "loss": 0.0028, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1217 }, { "completion_length": 153.875, "epoch": 0.3045, "grad_norm": 0.022167326882481575, "kl": 0.04559296742081642, "learning_rate": 4.389633672919099e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1218 }, { "completion_length": 143.5, "epoch": 0.30475, "grad_norm": 0.6811370253562927, "kl": 0.05372779816389084, "learning_rate": 4.388204530712959e-06, "loss": 0.0021, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1219 }, { "completion_length": 159.5, "epoch": 0.305, "grad_norm": 0.6724026203155518, "kl": 0.06683686375617981, "learning_rate": 4.386773950556931e-06, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1220 }, { "completion_length": 190.5, "epoch": 0.30525, "grad_norm": 0.4614053964614868, "kl": 0.0702865943312645, "learning_rate": 4.385341933540461e-06, "loss": 0.0028, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1221 }, { "completion_length": 199.875, "epoch": 0.3055, "grad_norm": 0.4716734290122986, "kl": 0.06874912232160568, "learning_rate": 4.3839084807540956e-06, "loss": 0.0027, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1222 }, { "completion_length": 116.5, "epoch": 0.30575, "grad_norm": 0.662601113319397, "kl": 0.047971662133932114, "learning_rate": 4.3824735932894695e-06, "loss": 0.0019, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1223 }, { "completion_length": 95.625, "epoch": 0.306, "grad_norm": 0.805237352848053, "kl": 0.037139102816581726, "learning_rate": 4.381037272239311e-06, "loss": 0.0015, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1224 }, { "completion_length": 182.875, "epoch": 0.30625, "grad_norm": 0.480247437953949, "kl": 0.06881922483444214, "learning_rate": 4.379599518697444e-06, "loss": 0.0028, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1225 }, { "completion_length": 81.0, "epoch": 0.3065, "grad_norm": 0.9215694665908813, "kl": 0.08191632479429245, "learning_rate": 4.378160333758779e-06, "loss": 0.0033, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 1226 }, { "completion_length": 178.875, "epoch": 0.30675, "grad_norm": 0.04747424274682999, "kl": 0.08109784126281738, "learning_rate": 4.3767197185193164e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1227 }, { "completion_length": 166.25, "epoch": 0.307, "grad_norm": 0.45579221844673157, "kl": 0.04330654814839363, "learning_rate": 4.3752776740761495e-06, "loss": 0.0017, "reward": 1.631250023841858, "reward_std": 0.7382108569145203, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 1228 }, { "completion_length": 167.625, "epoch": 0.30725, "grad_norm": 0.5388981699943542, "kl": 0.044501304626464844, "learning_rate": 4.373834201527457e-06, "loss": 0.0018, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.875, "step": 1229 }, { "completion_length": 142.375, "epoch": 0.3075, "grad_norm": 0.04394035413861275, "kl": 0.06808657199144363, "learning_rate": 4.372389301972506e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1230 }, { "completion_length": 145.625, "epoch": 0.30775, "grad_norm": 0.036082785576581955, "kl": 0.06343095749616623, "learning_rate": 4.370942976511651e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1231 }, { "completion_length": 109.875, "epoch": 0.308, "grad_norm": 0.6433164477348328, "kl": 0.05788550525903702, "learning_rate": 4.36949522624633e-06, "loss": 0.0023, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1232 }, { "completion_length": 137.875, "epoch": 0.30825, "grad_norm": 0.6208747625350952, "kl": 0.04600555822253227, "learning_rate": 4.36804605227907e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1233 }, { "completion_length": 134.125, "epoch": 0.3085, "grad_norm": 0.0351361520588398, "kl": 0.07358551025390625, "learning_rate": 4.366595455713479e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1234 }, { "completion_length": 83.75, "epoch": 0.30875, "grad_norm": 1.0054473876953125, "kl": 0.03810626268386841, "learning_rate": 4.365143437654249e-06, "loss": 0.0015, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1235 }, { "completion_length": 203.75, "epoch": 0.309, "grad_norm": 0.4629731774330139, "kl": 0.044374044984579086, "learning_rate": 4.3636899992071555e-06, "loss": 0.0018, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1236 }, { "completion_length": 159.375, "epoch": 0.30925, "grad_norm": 0.017106125131249428, "kl": 0.03975909203290939, "learning_rate": 4.362235141479055e-06, "loss": 0.0016, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1237 }, { "completion_length": 151.75, "epoch": 0.3095, "grad_norm": 0.4982658326625824, "kl": 0.056246038526296616, "learning_rate": 4.360778865577885e-06, "loss": 0.0022, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1238 }, { "completion_length": 155.75, "epoch": 0.30975, "grad_norm": 0.7274753451347351, "kl": 0.05836237221956253, "learning_rate": 4.359321172612664e-06, "loss": 0.0023, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 1.0, "step": 1239 }, { "completion_length": 119.375, "epoch": 0.31, "grad_norm": 0.7453700304031372, "kl": 0.05975281819701195, "learning_rate": 4.357862063693486e-06, "loss": 0.0024, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1240 }, { "completion_length": 136.75, "epoch": 0.31025, "grad_norm": 0.6040889620780945, "kl": 0.06652972102165222, "learning_rate": 4.356401539931528e-06, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1241 }, { "completion_length": 173.25, "epoch": 0.3105, "grad_norm": 0.44152548909187317, "kl": 0.04805905371904373, "learning_rate": 4.354939602439041e-06, "loss": 0.0019, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1242 }, { "completion_length": 158.5, "epoch": 0.31075, "grad_norm": 0.021656127646565437, "kl": 0.05573081597685814, "learning_rate": 4.353476252329356e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1243 }, { "completion_length": 117.5, "epoch": 0.311, "grad_norm": 0.023949656635522842, "kl": 0.057754624634981155, "learning_rate": 4.352011490716875e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1244 }, { "completion_length": 166.375, "epoch": 0.31125, "grad_norm": 0.7155207395553589, "kl": 0.045125432312488556, "learning_rate": 4.350545318717081e-06, "loss": 0.0018, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.875, "step": 1245 }, { "completion_length": 131.0, "epoch": 0.3115, "grad_norm": 0.5737600922584534, "kl": 0.035169921815395355, "learning_rate": 4.349077737446525e-06, "loss": 0.0014, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1246 }, { "completion_length": 192.75, "epoch": 0.31175, "grad_norm": 0.5308739542961121, "kl": 0.04862486571073532, "learning_rate": 4.347608748022835e-06, "loss": 0.0019, "reward": 1.2774999141693115, "reward_std": 0.44627827405929565, "rewards/_accuracy_reward": 0.2774999737739563, "rewards/_format_reward": 1.0, "step": 1247 }, { "completion_length": 128.5, "epoch": 0.312, "grad_norm": 0.8783921003341675, "kl": 0.0623704232275486, "learning_rate": 4.346138351564711e-06, "loss": 0.0025, "reward": 1.401249885559082, "reward_std": 0.49599650502204895, "rewards/_accuracy_reward": 0.4012499749660492, "rewards/_format_reward": 1.0, "step": 1248 }, { "completion_length": 170.5, "epoch": 0.31225, "grad_norm": 0.6559812426567078, "kl": 0.05016009509563446, "learning_rate": 4.344666549191921e-06, "loss": 0.002, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1249 }, { "completion_length": 194.25, "epoch": 0.3125, "grad_norm": 0.4926063120365143, "kl": 0.037470243871212006, "learning_rate": 4.34319334202531e-06, "loss": 0.0015, "reward": 1.256250023841858, "reward_std": 0.8845650553703308, "rewards/_accuracy_reward": 0.6312500238418579, "rewards/_format_reward": 0.625, "step": 1250 }, { "completion_length": 163.25, "epoch": 0.31275, "grad_norm": 0.6643059253692627, "kl": 0.0781041607260704, "learning_rate": 4.341718731186788e-06, "loss": 0.0031, "reward": 1.5187499523162842, "reward_std": 0.7323824167251587, "rewards/_accuracy_reward": 0.643750011920929, "rewards/_format_reward": 0.875, "step": 1251 }, { "completion_length": 168.625, "epoch": 0.313, "grad_norm": 0.5912598967552185, "kl": 0.0778733566403389, "learning_rate": 4.340242717799337e-06, "loss": 0.0031, "reward": 1.3875000476837158, "reward_std": 0.7371518611907959, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.75, "step": 1252 }, { "completion_length": 148.0, "epoch": 0.31325, "grad_norm": 0.022581512108445168, "kl": 0.06389284133911133, "learning_rate": 4.338765302987001e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1253 }, { "completion_length": 123.625, "epoch": 0.3135, "grad_norm": 0.023221751675009727, "kl": 0.05139881372451782, "learning_rate": 4.3372864878749e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1254 }, { "completion_length": 175.0, "epoch": 0.31375, "grad_norm": 0.5853912830352783, "kl": 0.05094706267118454, "learning_rate": 4.335806273589214e-06, "loss": 0.002, "reward": 1.5012500286102295, "reward_std": 0.9235093593597412, "rewards/_accuracy_reward": 0.7512500286102295, "rewards/_format_reward": 0.75, "step": 1255 }, { "completion_length": 187.0, "epoch": 0.314, "grad_norm": 0.4903429448604584, "kl": 0.05913626775145531, "learning_rate": 4.334324661257191e-06, "loss": 0.0024, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1256 }, { "completion_length": 158.75, "epoch": 0.31425, "grad_norm": 0.5297297239303589, "kl": 0.03816875442862511, "learning_rate": 4.332841652007144e-06, "loss": 0.0015, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1257 }, { "completion_length": 121.125, "epoch": 0.3145, "grad_norm": 0.8008362650871277, "kl": 0.055129993706941605, "learning_rate": 4.331357246968447e-06, "loss": 0.0022, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 1258 }, { "completion_length": 179.0, "epoch": 0.31475, "grad_norm": 0.020386753603816032, "kl": 0.04700789228081703, "learning_rate": 4.329871447271541e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1259 }, { "completion_length": 122.875, "epoch": 0.315, "grad_norm": 0.7078530192375183, "kl": 0.03913933411240578, "learning_rate": 4.328384254047927e-06, "loss": 0.0016, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.668749988079071, "rewards/_format_reward": 1.0, "step": 1260 }, { "completion_length": 102.625, "epoch": 0.31525, "grad_norm": 0.8336498141288757, "kl": 0.09925613552331924, "learning_rate": 4.326895668430166e-06, "loss": 0.004, "reward": 1.5012500286102295, "reward_std": 0.42089828848838806, "rewards/_accuracy_reward": 0.5012500286102295, "rewards/_format_reward": 1.0, "step": 1261 }, { "completion_length": 163.5, "epoch": 0.3155, "grad_norm": 0.6298437118530273, "kl": 0.06484002619981766, "learning_rate": 4.3254056915518815e-06, "loss": 0.0026, "reward": 1.631250023841858, "reward_std": 0.7382108569145203, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 1262 }, { "completion_length": 180.0, "epoch": 0.31575, "grad_norm": 0.6922435164451599, "kl": 0.06569919735193253, "learning_rate": 4.323914324547755e-06, "loss": 0.0026, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1263 }, { "completion_length": 120.625, "epoch": 0.316, "grad_norm": 0.8579962253570557, "kl": 0.06102241203188896, "learning_rate": 4.322421568553529e-06, "loss": 0.0024, "reward": 1.6687500476837158, "reward_std": 0.4613160789012909, "rewards/_accuracy_reward": 0.6687500476837158, "rewards/_format_reward": 1.0, "step": 1264 }, { "completion_length": 148.25, "epoch": 0.31625, "grad_norm": 0.5821033120155334, "kl": 0.04418959096074104, "learning_rate": 4.320927424706001e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1265 }, { "completion_length": 171.25, "epoch": 0.3165, "grad_norm": 0.6433318853378296, "kl": 0.05075891688466072, "learning_rate": 4.319431894143027e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1266 }, { "completion_length": 142.125, "epoch": 0.31675, "grad_norm": 0.5217633247375488, "kl": 0.03551534563302994, "learning_rate": 4.317934978003517e-06, "loss": 0.0014, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1267 }, { "completion_length": 146.25, "epoch": 0.317, "grad_norm": 0.6079625487327576, "kl": 0.03397079184651375, "learning_rate": 4.316436677427441e-06, "loss": 0.0014, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1268 }, { "completion_length": 161.625, "epoch": 0.31725, "grad_norm": 0.6995141506195068, "kl": 0.031033797189593315, "learning_rate": 4.314936993555816e-06, "loss": 0.0012, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1269 }, { "completion_length": 173.25, "epoch": 0.3175, "grad_norm": 0.593582808971405, "kl": 0.048646219074726105, "learning_rate": 4.313435927530719e-06, "loss": 0.0019, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1270 }, { "completion_length": 154.125, "epoch": 0.31775, "grad_norm": 0.7062419056892395, "kl": 0.04136907681822777, "learning_rate": 4.311933480495278e-06, "loss": 0.0017, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1271 }, { "completion_length": 159.75, "epoch": 0.318, "grad_norm": 0.4106045663356781, "kl": 0.057584941387176514, "learning_rate": 4.3104296535936695e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1272 }, { "completion_length": 132.5, "epoch": 0.31825, "grad_norm": 0.06259723007678986, "kl": 0.07464718818664551, "learning_rate": 4.308924447971123e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1273 }, { "completion_length": 143.125, "epoch": 0.3185, "grad_norm": 0.025278618559241295, "kl": 0.04770468547940254, "learning_rate": 4.3074178647739205e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1274 }, { "completion_length": 153.375, "epoch": 0.31875, "grad_norm": 0.6714683771133423, "kl": 0.0832626223564148, "learning_rate": 4.305909905149389e-06, "loss": 0.0033, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1275 }, { "completion_length": 152.0, "epoch": 0.319, "grad_norm": 0.03329375758767128, "kl": 0.06376402080059052, "learning_rate": 4.3044005702459055e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1276 }, { "completion_length": 150.125, "epoch": 0.31925, "grad_norm": 0.0274689681828022, "kl": 0.0662902370095253, "learning_rate": 4.302889861212894e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1277 }, { "completion_length": 151.0, "epoch": 0.3195, "grad_norm": 0.5757763981819153, "kl": 0.0469672717154026, "learning_rate": 4.301377779200826e-06, "loss": 0.0019, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1278 }, { "completion_length": 106.25, "epoch": 0.31975, "grad_norm": 0.015396623872220516, "kl": 0.08767110854387283, "learning_rate": 4.299864325361217e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1279 }, { "completion_length": 147.75, "epoch": 0.32, "grad_norm": 1.0080978870391846, "kl": 0.050190072506666183, "learning_rate": 4.2983495008466285e-06, "loss": 0.002, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1280 }, { "completion_length": 139.625, "epoch": 0.32025, "grad_norm": 0.04891718551516533, "kl": 0.08530929684638977, "learning_rate": 4.2968333068106635e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1281 }, { "completion_length": 175.25, "epoch": 0.3205, "grad_norm": 0.6860036253929138, "kl": 0.09078608453273773, "learning_rate": 4.295315744407972e-06, "loss": 0.0036, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1282 }, { "completion_length": 172.75, "epoch": 0.32075, "grad_norm": 0.03762355074286461, "kl": 0.05991184711456299, "learning_rate": 4.293796814794243e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1283 }, { "completion_length": 176.375, "epoch": 0.321, "grad_norm": 0.5721231698989868, "kl": 0.05818440765142441, "learning_rate": 4.2922765191262075e-06, "loss": 0.0023, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1284 }, { "completion_length": 133.125, "epoch": 0.32125, "grad_norm": 0.7689949870109558, "kl": 0.0632908046245575, "learning_rate": 4.290754858561636e-06, "loss": 0.0025, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1285 }, { "completion_length": 163.75, "epoch": 0.3215, "grad_norm": 0.44197624921798706, "kl": 0.06356453895568848, "learning_rate": 4.28923183425934e-06, "loss": 0.0025, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1286 }, { "completion_length": 109.75, "epoch": 0.32175, "grad_norm": 0.8185544610023499, "kl": 0.07203707098960876, "learning_rate": 4.287707447379169e-06, "loss": 0.0029, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1287 }, { "completion_length": 157.75, "epoch": 0.322, "grad_norm": 0.8443469405174255, "kl": 0.06975241750478745, "learning_rate": 4.286181699082008e-06, "loss": 0.0028, "reward": 1.337499976158142, "reward_std": 0.41811659932136536, "rewards/_accuracy_reward": 0.3374999761581421, "rewards/_format_reward": 1.0, "step": 1288 }, { "completion_length": 99.25, "epoch": 0.32225, "grad_norm": 0.017100084573030472, "kl": 0.061960458755493164, "learning_rate": 4.284654590529784e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1289 }, { "completion_length": 148.5, "epoch": 0.3225, "grad_norm": 0.7848110198974609, "kl": 0.05555134639143944, "learning_rate": 4.283126122885455e-06, "loss": 0.0022, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1290 }, { "completion_length": 117.375, "epoch": 0.32275, "grad_norm": 0.7877644896507263, "kl": 0.09985921531915665, "learning_rate": 4.281596297313014e-06, "loss": 0.004, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 1291 }, { "completion_length": 154.25, "epoch": 0.323, "grad_norm": 0.030576931312680244, "kl": 0.05328349769115448, "learning_rate": 4.280065114977492e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1292 }, { "completion_length": 120.5, "epoch": 0.32325, "grad_norm": 0.12656356394290924, "kl": 0.08763141185045242, "learning_rate": 4.278532577044949e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1293 }, { "completion_length": 171.25, "epoch": 0.3235, "grad_norm": 0.6466588377952576, "kl": 0.06330376863479614, "learning_rate": 4.276998684682482e-06, "loss": 0.0025, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1294 }, { "completion_length": 114.25, "epoch": 0.32375, "grad_norm": 0.03600083664059639, "kl": 0.07176019996404648, "learning_rate": 4.275463439058214e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1295 }, { "completion_length": 181.5, "epoch": 0.324, "grad_norm": 0.6595420837402344, "kl": 0.0831797644495964, "learning_rate": 4.273926841341303e-06, "loss": 0.0033, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 1296 }, { "completion_length": 171.125, "epoch": 0.32425, "grad_norm": 0.037904031574726105, "kl": 0.06669317185878754, "learning_rate": 4.272388892701934e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1297 }, { "completion_length": 173.0, "epoch": 0.3245, "grad_norm": 0.7312850952148438, "kl": 0.05984557047486305, "learning_rate": 4.270849594311323e-06, "loss": 0.0024, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.75, "step": 1298 }, { "completion_length": 161.125, "epoch": 0.32475, "grad_norm": 0.6081017255783081, "kl": 0.04914931207895279, "learning_rate": 4.269308947341711e-06, "loss": 0.002, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 1299 }, { "completion_length": 167.0, "epoch": 0.325, "grad_norm": 0.6635521650314331, "kl": 0.06111575663089752, "learning_rate": 4.267766952966369e-06, "loss": 0.0024, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1300 }, { "completion_length": 176.5, "epoch": 0.32525, "grad_norm": 0.042982131242752075, "kl": 0.06564896553754807, "learning_rate": 4.266223612359593e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1301 }, { "completion_length": 163.625, "epoch": 0.3255, "grad_norm": 0.638775110244751, "kl": 0.057519882917404175, "learning_rate": 4.264678926696703e-06, "loss": 0.0023, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1302 }, { "completion_length": 137.5, "epoch": 0.32575, "grad_norm": 0.7242026329040527, "kl": 0.057923607528209686, "learning_rate": 4.263132897154044e-06, "loss": 0.0023, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1303 }, { "completion_length": 119.0, "epoch": 0.326, "grad_norm": 0.5665600895881653, "kl": 0.052210718393325806, "learning_rate": 4.261585524908987e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1304 }, { "completion_length": 184.0, "epoch": 0.32625, "grad_norm": 0.6595292091369629, "kl": 0.07032403349876404, "learning_rate": 4.260036811139922e-06, "loss": 0.0028, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1305 }, { "completion_length": 185.625, "epoch": 0.3265, "grad_norm": 0.031764958053827286, "kl": 0.05210375413298607, "learning_rate": 4.25848675702626e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1306 }, { "completion_length": 198.5, "epoch": 0.32675, "grad_norm": 0.5040040016174316, "kl": 0.04720192775130272, "learning_rate": 4.256935363748437e-06, "loss": 0.0019, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.75, "step": 1307 }, { "completion_length": 216.25, "epoch": 0.327, "grad_norm": 0.43086105585098267, "kl": 0.04231351986527443, "learning_rate": 4.255382632487907e-06, "loss": 0.0017, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 1308 }, { "completion_length": 133.375, "epoch": 0.32725, "grad_norm": 0.6797394752502441, "kl": 0.04834214597940445, "learning_rate": 4.25382856442714e-06, "loss": 0.0019, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1309 }, { "completion_length": 132.375, "epoch": 0.3275, "grad_norm": 0.6795439720153809, "kl": 0.0568021684885025, "learning_rate": 4.2522731607496275e-06, "loss": 0.0023, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1310 }, { "completion_length": 168.625, "epoch": 0.32775, "grad_norm": 0.6751994490623474, "kl": 0.0991804301738739, "learning_rate": 4.250716422639878e-06, "loss": 0.004, "reward": 1.212499976158142, "reward_std": 0.3324691653251648, "rewards/_accuracy_reward": 0.3374999761581421, "rewards/_format_reward": 0.875, "step": 1311 }, { "completion_length": 186.625, "epoch": 0.328, "grad_norm": 0.7776236534118652, "kl": 0.042616959661245346, "learning_rate": 4.249158351283414e-06, "loss": 0.0017, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1312 }, { "completion_length": 133.25, "epoch": 0.32825, "grad_norm": 0.02199604921042919, "kl": 0.05136921629309654, "learning_rate": 4.247598947866775e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1313 }, { "completion_length": 164.875, "epoch": 0.3285, "grad_norm": 0.5183839797973633, "kl": 0.06551958620548248, "learning_rate": 4.246038213577516e-06, "loss": 0.0026, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1314 }, { "completion_length": 170.875, "epoch": 0.32875, "grad_norm": 0.5866003632545471, "kl": 0.05409818887710571, "learning_rate": 4.244476149604201e-06, "loss": 0.0022, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1315 }, { "completion_length": 139.625, "epoch": 0.329, "grad_norm": 0.7940442562103271, "kl": 0.05304113030433655, "learning_rate": 4.242912757136412e-06, "loss": 0.0021, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1316 }, { "completion_length": 205.0, "epoch": 0.32925, "grad_norm": 0.5274433493614197, "kl": 0.06083288788795471, "learning_rate": 4.24134803736474e-06, "loss": 0.0024, "reward": 1.2625000476837158, "reward_std": 0.8826704621315002, "rewards/_accuracy_reward": 0.512499988079071, "rewards/_format_reward": 0.75, "step": 1317 }, { "completion_length": 181.125, "epoch": 0.3295, "grad_norm": 0.5549687147140503, "kl": 0.06591016054153442, "learning_rate": 4.239781991480786e-06, "loss": 0.0026, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1318 }, { "completion_length": 132.75, "epoch": 0.32975, "grad_norm": 0.5778529047966003, "kl": 0.06366316229104996, "learning_rate": 4.238214620677164e-06, "loss": 0.0025, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1319 }, { "completion_length": 136.375, "epoch": 0.33, "grad_norm": 0.024910366162657738, "kl": 0.04503370448946953, "learning_rate": 4.236645926147493e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1320 }, { "completion_length": 173.625, "epoch": 0.33025, "grad_norm": 0.5607073307037354, "kl": 0.05769224464893341, "learning_rate": 4.235075909086405e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1321 }, { "completion_length": 100.625, "epoch": 0.3305, "grad_norm": 1.6146537065505981, "kl": 0.4625369906425476, "learning_rate": 4.233504570689533e-06, "loss": 0.0185, "reward": 1.693750023841858, "reward_std": 0.42714792490005493, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 1322 }, { "completion_length": 171.125, "epoch": 0.33075, "grad_norm": 0.632230818271637, "kl": 0.04798278212547302, "learning_rate": 4.231931912153521e-06, "loss": 0.0019, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 1323 }, { "completion_length": 138.125, "epoch": 0.331, "grad_norm": 0.022344160825014114, "kl": 0.04536424204707146, "learning_rate": 4.230357934676017e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1324 }, { "completion_length": 169.875, "epoch": 0.33125, "grad_norm": 0.5961106419563293, "kl": 0.06459388136863708, "learning_rate": 4.228782639455674e-06, "loss": 0.0026, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1325 }, { "completion_length": 145.875, "epoch": 0.3315, "grad_norm": 0.5319207906723022, "kl": 0.04201117902994156, "learning_rate": 4.227206027692146e-06, "loss": 0.0017, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1326 }, { "completion_length": 178.5, "epoch": 0.33175, "grad_norm": 0.607018768787384, "kl": 0.04852335527539253, "learning_rate": 4.225628100586093e-06, "loss": 0.0019, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1327 }, { "completion_length": 133.125, "epoch": 0.332, "grad_norm": 0.019281940534710884, "kl": 0.0670793280005455, "learning_rate": 4.224048859339175e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1328 }, { "completion_length": 133.625, "epoch": 0.33225, "grad_norm": 0.7307944297790527, "kl": 0.06218728423118591, "learning_rate": 4.222468305154052e-06, "loss": 0.0025, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1329 }, { "completion_length": 157.625, "epoch": 0.3325, "grad_norm": 0.48206326365470886, "kl": 0.04004458710551262, "learning_rate": 4.220886439234385e-06, "loss": 0.0016, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1330 }, { "completion_length": 160.75, "epoch": 0.33275, "grad_norm": 0.036685239523649216, "kl": 0.06175795570015907, "learning_rate": 4.219303262784834e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1331 }, { "completion_length": 153.25, "epoch": 0.333, "grad_norm": 0.6564381718635559, "kl": 0.04746977239847183, "learning_rate": 4.217718777011058e-06, "loss": 0.0019, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1332 }, { "completion_length": 139.5, "epoch": 0.33325, "grad_norm": 0.5996315479278564, "kl": 0.04581526294350624, "learning_rate": 4.2161329831197095e-06, "loss": 0.0018, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1333 }, { "completion_length": 170.25, "epoch": 0.3335, "grad_norm": 0.6188631057739258, "kl": 0.05417291074991226, "learning_rate": 4.2145458823184414e-06, "loss": 0.0022, "reward": 1.274999976158142, "reward_std": 0.6974443197250366, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.875, "step": 1334 }, { "completion_length": 148.0, "epoch": 0.33375, "grad_norm": 0.023063072934746742, "kl": 0.04357193037867546, "learning_rate": 4.212957475815898e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1335 }, { "completion_length": 110.375, "epoch": 0.334, "grad_norm": 0.8400101661682129, "kl": 0.05062510818243027, "learning_rate": 4.211367764821722e-06, "loss": 0.002, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1336 }, { "completion_length": 174.0, "epoch": 0.33425, "grad_norm": 0.6004985570907593, "kl": 0.03446627035737038, "learning_rate": 4.209776750546547e-06, "loss": 0.0014, "reward": 1.78125, "reward_std": 0.6187184453010559, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 1337 }, { "completion_length": 151.125, "epoch": 0.3345, "grad_norm": 0.5468786954879761, "kl": 0.06371209770441055, "learning_rate": 4.208184434201999e-06, "loss": 0.0025, "reward": 1.1687499284744263, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.16875000298023224, "rewards/_format_reward": 1.0, "step": 1338 }, { "completion_length": 172.375, "epoch": 0.33475, "grad_norm": 0.024628562852740288, "kl": 0.04282301664352417, "learning_rate": 4.206590817000695e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1339 }, { "completion_length": 211.25, "epoch": 0.335, "grad_norm": 0.47119390964508057, "kl": 0.06950601935386658, "learning_rate": 4.204995900156247e-06, "loss": 0.0028, "reward": 1.6262500286102295, "reward_std": 0.7428312301635742, "rewards/_accuracy_reward": 0.7512500286102295, "rewards/_format_reward": 0.875, "step": 1340 }, { "completion_length": 150.25, "epoch": 0.33525, "grad_norm": 0.08035016059875488, "kl": 0.0573977455496788, "learning_rate": 4.20339968488325e-06, "loss": 0.0023, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 1341 }, { "completion_length": 160.25, "epoch": 0.3355, "grad_norm": 0.7187206149101257, "kl": 0.07497703284025192, "learning_rate": 4.201802172397295e-06, "loss": 0.003, "reward": 1.1512500047683716, "reward_std": 0.6355074048042297, "rewards/_accuracy_reward": 0.2762500047683716, "rewards/_format_reward": 0.875, "step": 1342 }, { "completion_length": 173.75, "epoch": 0.33575, "grad_norm": 0.5304349660873413, "kl": 0.08109744638204575, "learning_rate": 4.2002033639149545e-06, "loss": 0.0032, "reward": 1.537500023841858, "reward_std": 0.7322909235954285, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.875, "step": 1343 }, { "completion_length": 176.875, "epoch": 0.336, "grad_norm": 0.6054671406745911, "kl": 0.05511131510138512, "learning_rate": 4.198603260653792e-06, "loss": 0.0022, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1344 }, { "completion_length": 162.75, "epoch": 0.33625, "grad_norm": 0.025135153904557228, "kl": 0.05139942467212677, "learning_rate": 4.197001863832355e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1345 }, { "completion_length": 137.625, "epoch": 0.3365, "grad_norm": 0.6007516980171204, "kl": 0.05004937946796417, "learning_rate": 4.195399174670177e-06, "loss": 0.002, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1346 }, { "completion_length": 197.875, "epoch": 0.33675, "grad_norm": 0.5018278956413269, "kl": 0.05320580676198006, "learning_rate": 4.193795194387776e-06, "loss": 0.0021, "reward": 1.1437499523162842, "reward_std": 0.8317097425460815, "rewards/_accuracy_reward": 0.39375001192092896, "rewards/_format_reward": 0.75, "step": 1347 }, { "completion_length": 96.375, "epoch": 0.337, "grad_norm": 0.7763251662254333, "kl": 0.13074029982089996, "learning_rate": 4.192189924206652e-06, "loss": 0.0052, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 1348 }, { "completion_length": 173.75, "epoch": 0.33725, "grad_norm": 0.021658629179000854, "kl": 0.04604887589812279, "learning_rate": 4.190583365349289e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1349 }, { "completion_length": 183.25, "epoch": 0.3375, "grad_norm": 0.5501531958580017, "kl": 0.05152320861816406, "learning_rate": 4.188975519039151e-06, "loss": 0.0021, "reward": 1.274999976158142, "reward_std": 0.6974443197250366, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.875, "step": 1350 }, { "completion_length": 216.5, "epoch": 0.33775, "grad_norm": 0.547321081161499, "kl": 0.05665482208132744, "learning_rate": 4.1873663865006835e-06, "loss": 0.0023, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1351 }, { "completion_length": 162.75, "epoch": 0.338, "grad_norm": 0.6683735847473145, "kl": 0.043146222829818726, "learning_rate": 4.185755968959308e-06, "loss": 0.0017, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1352 }, { "completion_length": 170.375, "epoch": 0.33825, "grad_norm": 0.6152597665786743, "kl": 0.05195571482181549, "learning_rate": 4.184144267641433e-06, "loss": 0.0021, "reward": 0.8124999403953552, "reward_std": 0.5062113404273987, "rewards/_accuracy_reward": 0.0625, "rewards/_format_reward": 0.75, "step": 1353 }, { "completion_length": 150.125, "epoch": 0.3385, "grad_norm": 0.7096376419067383, "kl": 0.054735492914915085, "learning_rate": 4.182531283774434e-06, "loss": 0.0022, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1354 }, { "completion_length": 138.0, "epoch": 0.33875, "grad_norm": 0.024319298565387726, "kl": 0.051283035427331924, "learning_rate": 4.18091701858667e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1355 }, { "completion_length": 108.25, "epoch": 0.339, "grad_norm": 0.8185098171234131, "kl": 0.06744442135095596, "learning_rate": 4.179301473307476e-06, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1356 }, { "completion_length": 154.25, "epoch": 0.33925, "grad_norm": 0.5948598384857178, "kl": 0.059390176087617874, "learning_rate": 4.177684649167158e-06, "loss": 0.0024, "reward": 1.7574999332427979, "reward_std": 0.4491499960422516, "rewards/_accuracy_reward": 0.7574999928474426, "rewards/_format_reward": 1.0, "step": 1357 }, { "completion_length": 178.125, "epoch": 0.3395, "grad_norm": 0.023550184443593025, "kl": 0.04499709606170654, "learning_rate": 4.176066547396998e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1358 }, { "completion_length": 153.375, "epoch": 0.33975, "grad_norm": 0.027199365198612213, "kl": 0.05625481531023979, "learning_rate": 4.174447169229252e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1359 }, { "completion_length": 137.875, "epoch": 0.34, "grad_norm": 0.019031843170523643, "kl": 0.051029808819293976, "learning_rate": 4.172826515897146e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1360 }, { "completion_length": 158.75, "epoch": 0.34025, "grad_norm": 0.03614223375916481, "kl": 0.06285353004932404, "learning_rate": 4.171204588634878e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1361 }, { "completion_length": 122.625, "epoch": 0.3405, "grad_norm": 0.5458539128303528, "kl": 0.06035559996962547, "learning_rate": 4.169581388677617e-06, "loss": 0.0024, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1362 }, { "completion_length": 120.125, "epoch": 0.34075, "grad_norm": 0.029562877491116524, "kl": 0.0810147076845169, "learning_rate": 4.1679569172614994e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1363 }, { "completion_length": 188.125, "epoch": 0.341, "grad_norm": 0.6008427143096924, "kl": 0.06007464975118637, "learning_rate": 4.166331175623631e-06, "loss": 0.0024, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1364 }, { "completion_length": 129.875, "epoch": 0.34125, "grad_norm": 0.7642394304275513, "kl": 0.07598597556352615, "learning_rate": 4.164704165002086e-06, "loss": 0.003, "reward": 1.4075000286102295, "reward_std": 0.3749571442604065, "rewards/_accuracy_reward": 0.4074999988079071, "rewards/_format_reward": 1.0, "step": 1365 }, { "completion_length": 156.5, "epoch": 0.3415, "grad_norm": 0.024682415649294853, "kl": 0.049706555902957916, "learning_rate": 4.163075886635902e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1366 }, { "completion_length": 106.875, "epoch": 0.34175, "grad_norm": 0.024045802652835846, "kl": 0.04866664111614227, "learning_rate": 4.161446341765085e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1367 }, { "completion_length": 156.25, "epoch": 0.342, "grad_norm": 0.6025788187980652, "kl": 0.051358725875616074, "learning_rate": 4.159815531630604e-06, "loss": 0.0021, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1368 }, { "completion_length": 155.625, "epoch": 0.34225, "grad_norm": 0.035298582166433334, "kl": 0.05683750659227371, "learning_rate": 4.158183457474392e-06, "loss": 0.0023, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/_accuracy_reward": 0.05000000074505806, "rewards/_format_reward": 1.0, "step": 1369 }, { "completion_length": 145.5, "epoch": 0.3425, "grad_norm": 0.6779209971427917, "kl": 0.08619740605354309, "learning_rate": 4.1565501205393445e-06, "loss": 0.0034, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1370 }, { "completion_length": 147.25, "epoch": 0.34275, "grad_norm": 0.055118631571531296, "kl": 0.07643330842256546, "learning_rate": 4.154915522069318e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1371 }, { "completion_length": 158.0, "epoch": 0.343, "grad_norm": 0.03090520389378071, "kl": 0.042203597724437714, "learning_rate": 4.15327966330913e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1372 }, { "completion_length": 151.125, "epoch": 0.34325, "grad_norm": 0.02153955027461052, "kl": 0.04990649223327637, "learning_rate": 4.15164254550456e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1373 }, { "completion_length": 172.25, "epoch": 0.3435, "grad_norm": 0.7357903718948364, "kl": 0.07494698464870453, "learning_rate": 4.150004169902343e-06, "loss": 0.003, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1374 }, { "completion_length": 138.75, "epoch": 0.34375, "grad_norm": 0.6570234894752502, "kl": 0.11442865431308746, "learning_rate": 4.1483645377501726e-06, "loss": 0.0046, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 1375 }, { "completion_length": 145.75, "epoch": 0.344, "grad_norm": 0.03160668909549713, "kl": 0.057240959256887436, "learning_rate": 4.146723650296701e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1376 }, { "completion_length": 139.5, "epoch": 0.34425, "grad_norm": 0.6059741973876953, "kl": 0.05288849398493767, "learning_rate": 4.145081508791536e-06, "loss": 0.0021, "reward": 1.7575000524520874, "reward_std": 0.449150025844574, "rewards/_accuracy_reward": 0.7575000524520874, "rewards/_format_reward": 1.0, "step": 1377 }, { "completion_length": 166.0, "epoch": 0.3445, "grad_norm": 0.6067208647727966, "kl": 0.056651707738637924, "learning_rate": 4.14343811448524e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1378 }, { "completion_length": 162.0, "epoch": 0.34475, "grad_norm": 0.5100969672203064, "kl": 0.06123419851064682, "learning_rate": 4.141793468629327e-06, "loss": 0.0024, "reward": 1.181249976158142, "reward_std": 0.6335486769676208, "rewards/_accuracy_reward": 0.3062499761581421, "rewards/_format_reward": 0.875, "step": 1379 }, { "completion_length": 117.125, "epoch": 0.345, "grad_norm": 0.5899714231491089, "kl": 0.07519141584634781, "learning_rate": 4.140147572476269e-06, "loss": 0.003, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1380 }, { "completion_length": 179.625, "epoch": 0.34525, "grad_norm": 0.471913605928421, "kl": 0.04343201965093613, "learning_rate": 4.138500427279485e-06, "loss": 0.0017, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1381 }, { "completion_length": 120.875, "epoch": 0.3455, "grad_norm": 0.021074136719107628, "kl": 0.058206070214509964, "learning_rate": 4.136852034293349e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1382 }, { "completion_length": 163.0, "epoch": 0.34575, "grad_norm": 0.5675240159034729, "kl": 0.05850347504019737, "learning_rate": 4.135202394773186e-06, "loss": 0.0023, "reward": 1.0374999046325684, "reward_std": 0.5350233912467957, "rewards/_accuracy_reward": 0.16249999403953552, "rewards/_format_reward": 0.875, "step": 1383 }, { "completion_length": 139.25, "epoch": 0.346, "grad_norm": 0.5779051184654236, "kl": 0.029702937230467796, "learning_rate": 4.133551509975264e-06, "loss": 0.0012, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 1384 }, { "completion_length": 109.125, "epoch": 0.34625, "grad_norm": 0.026169802993535995, "kl": 0.051198095083236694, "learning_rate": 4.1318993811568065e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1385 }, { "completion_length": 98.5, "epoch": 0.3465, "grad_norm": 0.02033095993101597, "kl": 0.06018479913473129, "learning_rate": 4.130246009575981e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1386 }, { "completion_length": 134.75, "epoch": 0.34675, "grad_norm": 0.7308709025382996, "kl": 0.06635252386331558, "learning_rate": 4.128591396491901e-06, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1387 }, { "completion_length": 181.875, "epoch": 0.347, "grad_norm": 0.5132285356521606, "kl": 0.03996492922306061, "learning_rate": 4.126935543164628e-06, "loss": 0.0016, "reward": 1.1437499523162842, "reward_std": 0.8317097425460815, "rewards/_accuracy_reward": 0.39374998211860657, "rewards/_format_reward": 0.75, "step": 1388 }, { "completion_length": 199.75, "epoch": 0.34725, "grad_norm": 0.6100365519523621, "kl": 0.058336708694696426, "learning_rate": 4.125278450855165e-06, "loss": 0.0023, "reward": 1.6262500286102295, "reward_std": 0.7428312301635742, "rewards/_accuracy_reward": 0.7512500286102295, "rewards/_format_reward": 0.875, "step": 1389 }, { "completion_length": 157.0, "epoch": 0.3475, "grad_norm": 0.668353259563446, "kl": 0.056433651596307755, "learning_rate": 4.123620120825459e-06, "loss": 0.0023, "reward": 1.6637499332427979, "reward_std": 0.4691310524940491, "rewards/_accuracy_reward": 0.6637500524520874, "rewards/_format_reward": 1.0, "step": 1390 }, { "completion_length": 138.75, "epoch": 0.34775, "grad_norm": 0.022827059030532837, "kl": 0.043130144476890564, "learning_rate": 4.1219605543384036e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1391 }, { "completion_length": 169.625, "epoch": 0.348, "grad_norm": 0.615968644618988, "kl": 0.045771509408950806, "learning_rate": 4.120299752657828e-06, "loss": 0.0018, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 1392 }, { "completion_length": 185.75, "epoch": 0.34825, "grad_norm": 0.035993240773677826, "kl": 0.0693710520863533, "learning_rate": 4.1186377170485055e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1393 }, { "completion_length": 187.0, "epoch": 0.3485, "grad_norm": 0.5210174918174744, "kl": 0.05211419612169266, "learning_rate": 4.11697444877615e-06, "loss": 0.0021, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1394 }, { "completion_length": 156.875, "epoch": 0.34875, "grad_norm": 0.7188504934310913, "kl": 0.061935752630233765, "learning_rate": 4.11530994910741e-06, "loss": 0.0025, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1395 }, { "completion_length": 125.5, "epoch": 0.349, "grad_norm": 0.8065851330757141, "kl": 0.08348346501588821, "learning_rate": 4.113644219309877e-06, "loss": 0.0033, "reward": 1.600000023841858, "reward_std": 0.43260011076927185, "rewards/_accuracy_reward": 0.6000000238418579, "rewards/_format_reward": 1.0, "step": 1396 }, { "completion_length": 168.875, "epoch": 0.34925, "grad_norm": 0.6905811429023743, "kl": 0.05310038477182388, "learning_rate": 4.1119772606520755e-06, "loss": 0.0021, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1397 }, { "completion_length": 166.875, "epoch": 0.3495, "grad_norm": 0.5804812908172607, "kl": 0.05809628963470459, "learning_rate": 4.110309074403467e-06, "loss": 0.0023, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 1398 }, { "completion_length": 125.75, "epoch": 0.34975, "grad_norm": 0.813575029373169, "kl": 0.034708425402641296, "learning_rate": 4.1086396618344474e-06, "loss": 0.0014, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 1399 }, { "completion_length": 102.125, "epoch": 0.35, "grad_norm": 0.6382107734680176, "kl": 0.07151403278112411, "learning_rate": 4.106969024216348e-06, "loss": 0.0029, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1400 }, { "completion_length": 183.875, "epoch": 0.35025, "grad_norm": 0.5745235681533813, "kl": 0.06930546462535858, "learning_rate": 4.105297162821433e-06, "loss": 0.0028, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 1401 }, { "completion_length": 124.0, "epoch": 0.3505, "grad_norm": 0.7871240377426147, "kl": 0.05033106729388237, "learning_rate": 4.103624078922895e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1402 }, { "completion_length": 108.25, "epoch": 0.35075, "grad_norm": 0.8062891364097595, "kl": 0.0857778936624527, "learning_rate": 4.101949773794862e-06, "loss": 0.0034, "reward": 1.78125, "reward_std": 0.6187184453010559, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 0.875, "step": 1403 }, { "completion_length": 213.875, "epoch": 0.351, "grad_norm": 0.6212908029556274, "kl": 0.06841997802257538, "learning_rate": 4.1002742487123896e-06, "loss": 0.0027, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/_accuracy_reward": 0.625, "rewards/_format_reward": 0.625, "step": 1404 }, { "completion_length": 161.5, "epoch": 0.35125, "grad_norm": 0.7645436525344849, "kl": 0.07526696473360062, "learning_rate": 4.098597504951462e-06, "loss": 0.003, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1405 }, { "completion_length": 195.75, "epoch": 0.3515, "grad_norm": 0.8574343323707581, "kl": 0.08652313798666, "learning_rate": 4.096919543788995e-06, "loss": 0.0035, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 1406 }, { "completion_length": 135.375, "epoch": 0.35175, "grad_norm": 0.03515785187482834, "kl": 0.06902702897787094, "learning_rate": 4.095240366502827e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1407 }, { "completion_length": 163.5, "epoch": 0.352, "grad_norm": 0.5809192061424255, "kl": 0.039451714605093, "learning_rate": 4.093559974371725e-06, "loss": 0.0016, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1408 }, { "completion_length": 131.25, "epoch": 0.35225, "grad_norm": 1.0675450563430786, "kl": 0.037023499608039856, "learning_rate": 4.09187836867538e-06, "loss": 0.0015, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1409 }, { "completion_length": 126.5, "epoch": 0.3525, "grad_norm": 0.03956000879406929, "kl": 0.05268242955207825, "learning_rate": 4.09019555069441e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1410 }, { "completion_length": 158.5, "epoch": 0.35275, "grad_norm": 1.107899785041809, "kl": 0.06893620640039444, "learning_rate": 4.088511521710353e-06, "loss": 0.0028, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.7625000476837158, "rewards/_format_reward": 1.0, "step": 1411 }, { "completion_length": 83.625, "epoch": 0.353, "grad_norm": 0.020886188372969627, "kl": 0.08969815075397491, "learning_rate": 4.086826283005669e-06, "loss": 0.0036, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1412 }, { "completion_length": 149.125, "epoch": 0.35325, "grad_norm": 0.8237113952636719, "kl": 0.060487691313028336, "learning_rate": 4.085139835863743e-06, "loss": 0.0024, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1413 }, { "completion_length": 172.375, "epoch": 0.3535, "grad_norm": 0.7035109996795654, "kl": 0.058261074125766754, "learning_rate": 4.083452181568876e-06, "loss": 0.0023, "reward": 1.3125, "reward_std": 0.4299086928367615, "rewards/_accuracy_reward": 0.3125, "rewards/_format_reward": 1.0, "step": 1414 }, { "completion_length": 164.5, "epoch": 0.35375, "grad_norm": 0.5613075494766235, "kl": 0.08956658095121384, "learning_rate": 4.081763321406291e-06, "loss": 0.0036, "reward": 1.2687499523162842, "reward_std": 0.699968159198761, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.75, "step": 1415 }, { "completion_length": 180.25, "epoch": 0.354, "grad_norm": 1.1197295188903809, "kl": 0.09375526010990143, "learning_rate": 4.080073256662128e-06, "loss": 0.0038, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1416 }, { "completion_length": 128.875, "epoch": 0.35425, "grad_norm": 0.031155169010162354, "kl": 0.0490286611020565, "learning_rate": 4.078381988623445e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1417 }, { "completion_length": 174.625, "epoch": 0.3545, "grad_norm": 0.612561047077179, "kl": 0.06778530031442642, "learning_rate": 4.076689518578217e-06, "loss": 0.0027, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.875, "step": 1418 }, { "completion_length": 87.25, "epoch": 0.35475, "grad_norm": 0.05403704568743706, "kl": 0.04753972589969635, "learning_rate": 4.074995847815331e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1419 }, { "completion_length": 144.375, "epoch": 0.355, "grad_norm": 0.03159556910395622, "kl": 0.05760689452290535, "learning_rate": 4.073300977624594e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1420 }, { "completion_length": 162.625, "epoch": 0.35525, "grad_norm": 0.7645871043205261, "kl": 0.0827740728855133, "learning_rate": 4.0716049092967224e-06, "loss": 0.0033, "reward": 1.3125, "reward_std": 0.4299086630344391, "rewards/_accuracy_reward": 0.3124999701976776, "rewards/_format_reward": 1.0, "step": 1421 }, { "completion_length": 120.75, "epoch": 0.3555, "grad_norm": 0.05223441123962402, "kl": 0.05872947722673416, "learning_rate": 4.069907644123346e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1422 }, { "completion_length": 133.625, "epoch": 0.35575, "grad_norm": 0.7665608525276184, "kl": 0.04824737831950188, "learning_rate": 4.068209183397005e-06, "loss": 0.0019, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 1.0, "step": 1423 }, { "completion_length": 93.625, "epoch": 0.356, "grad_norm": 0.898429274559021, "kl": 0.10655061900615692, "learning_rate": 4.066509528411151e-06, "loss": 0.0043, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1424 }, { "completion_length": 179.625, "epoch": 0.35625, "grad_norm": 0.7845519781112671, "kl": 0.0487191379070282, "learning_rate": 4.064808680460149e-06, "loss": 0.0019, "reward": 1.4562499523162842, "reward_std": 0.45781898498535156, "rewards/_accuracy_reward": 0.45624998211860657, "rewards/_format_reward": 1.0, "step": 1425 }, { "completion_length": 171.625, "epoch": 0.3565, "grad_norm": 0.8606228828430176, "kl": 0.05676417797803879, "learning_rate": 4.063106640839264e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1426 }, { "completion_length": 164.25, "epoch": 0.35675, "grad_norm": 0.6395389437675476, "kl": 0.07135939598083496, "learning_rate": 4.061403410844674e-06, "loss": 0.0029, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1427 }, { "completion_length": 193.0, "epoch": 0.357, "grad_norm": 0.5392956137657166, "kl": 0.048543695360422134, "learning_rate": 4.059698991773466e-06, "loss": 0.0019, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1428 }, { "completion_length": 176.625, "epoch": 0.35725, "grad_norm": 0.7690248489379883, "kl": 0.0833929181098938, "learning_rate": 4.057993384923626e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1429 }, { "completion_length": 133.75, "epoch": 0.3575, "grad_norm": 0.643065869808197, "kl": 0.059519246220588684, "learning_rate": 4.056286591594049e-06, "loss": 0.0024, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 1430 }, { "completion_length": 118.875, "epoch": 0.35775, "grad_norm": 0.72835773229599, "kl": 0.03510938957333565, "learning_rate": 4.0545786130845325e-06, "loss": 0.0014, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 1431 }, { "completion_length": 85.375, "epoch": 0.358, "grad_norm": 0.031654562801122665, "kl": 0.03833214193582535, "learning_rate": 4.052869450695776e-06, "loss": 0.0015, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1432 }, { "completion_length": 117.0, "epoch": 0.35825, "grad_norm": 0.052805446088314056, "kl": 0.07381996512413025, "learning_rate": 4.051159105729382e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1433 }, { "completion_length": 143.875, "epoch": 0.3585, "grad_norm": 0.6013376116752625, "kl": 0.05232910066843033, "learning_rate": 4.049447579487851e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1434 }, { "completion_length": 176.5, "epoch": 0.35875, "grad_norm": 0.03941601887345314, "kl": 0.06202106177806854, "learning_rate": 4.047734873274586e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1435 }, { "completion_length": 221.125, "epoch": 0.359, "grad_norm": 0.7312876582145691, "kl": 0.07595561444759369, "learning_rate": 4.046020988393886e-06, "loss": 0.003, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.2812499701976776, "rewards/_format_reward": 0.875, "step": 1436 }, { "completion_length": 173.5, "epoch": 0.35925, "grad_norm": 0.7492666840553284, "kl": 0.0508357509970665, "learning_rate": 4.04430592615095e-06, "loss": 0.002, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1437 }, { "completion_length": 89.875, "epoch": 0.3595, "grad_norm": 1.046403169631958, "kl": 0.052254389971494675, "learning_rate": 4.0425896878518725e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1438 }, { "completion_length": 149.375, "epoch": 0.35975, "grad_norm": 0.6857521533966064, "kl": 0.06438186764717102, "learning_rate": 4.0408722748036426e-06, "loss": 0.0026, "reward": 1.631250023841858, "reward_std": 0.7382108569145203, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 1439 }, { "completion_length": 160.25, "epoch": 0.36, "grad_norm": 0.8094133138656616, "kl": 0.07312402129173279, "learning_rate": 4.039153688314146e-06, "loss": 0.0029, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1440 }, { "completion_length": 129.625, "epoch": 0.36025, "grad_norm": 0.045799799263477325, "kl": 0.053429555147886276, "learning_rate": 4.037433929692161e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1441 }, { "completion_length": 164.875, "epoch": 0.3605, "grad_norm": 0.8309241533279419, "kl": 0.08402340859174728, "learning_rate": 4.035713000247358e-06, "loss": 0.0034, "reward": 1.756250023841858, "reward_std": 0.45153507590293884, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 1442 }, { "completion_length": 129.625, "epoch": 0.36075, "grad_norm": 0.04994899779558182, "kl": 0.08733442425727844, "learning_rate": 4.033990901290301e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1443 }, { "completion_length": 160.0, "epoch": 0.361, "grad_norm": 0.78934246301651, "kl": 0.07535295933485031, "learning_rate": 4.032267634132442e-06, "loss": 0.003, "reward": 1.631250023841858, "reward_std": 0.7382108569145203, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 1444 }, { "completion_length": 167.625, "epoch": 0.36125, "grad_norm": 0.8826912045478821, "kl": 0.05429309234023094, "learning_rate": 4.0305432000861236e-06, "loss": 0.0022, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1445 }, { "completion_length": 174.25, "epoch": 0.3615, "grad_norm": 1.0357098579406738, "kl": 0.0778610110282898, "learning_rate": 4.028817600464579e-06, "loss": 0.0031, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1446 }, { "completion_length": 117.25, "epoch": 0.36175, "grad_norm": 0.7004038691520691, "kl": 0.046138960868120193, "learning_rate": 4.027090836581925e-06, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1447 }, { "completion_length": 145.125, "epoch": 0.362, "grad_norm": 0.6335418224334717, "kl": 0.05013753101229668, "learning_rate": 4.02536290975317e-06, "loss": 0.002, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1448 }, { "completion_length": 169.5, "epoch": 0.36225, "grad_norm": 1.1653978824615479, "kl": 0.04540247470140457, "learning_rate": 4.023633821294203e-06, "loss": 0.0018, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 1.0, "step": 1449 }, { "completion_length": 132.375, "epoch": 0.3625, "grad_norm": 0.023695236071944237, "kl": 0.04884612187743187, "learning_rate": 4.021903572521802e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1450 }, { "completion_length": 147.5, "epoch": 0.36275, "grad_norm": 2.0923967361450195, "kl": 0.08431154489517212, "learning_rate": 4.020172164753626e-06, "loss": 0.0034, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1451 }, { "completion_length": 191.375, "epoch": 0.363, "grad_norm": 0.975801408290863, "kl": 0.09080733358860016, "learning_rate": 4.018439599308217e-06, "loss": 0.0036, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1452 }, { "completion_length": 124.5, "epoch": 0.36325, "grad_norm": 0.7811964154243469, "kl": 0.06944286823272705, "learning_rate": 4.016705877504999e-06, "loss": 0.0028, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1453 }, { "completion_length": 187.375, "epoch": 0.3635, "grad_norm": 0.9031335711479187, "kl": 0.07759063690900803, "learning_rate": 4.0149710006642775e-06, "loss": 0.0031, "reward": 1.2625000476837158, "reward_std": 0.8826704621315002, "rewards/_accuracy_reward": 0.512499988079071, "rewards/_format_reward": 0.75, "step": 1454 }, { "completion_length": 128.375, "epoch": 0.36375, "grad_norm": 1.2263360023498535, "kl": 0.0753258615732193, "learning_rate": 4.013234970107236e-06, "loss": 0.003, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1455 }, { "completion_length": 106.0, "epoch": 0.364, "grad_norm": 0.08058945089578629, "kl": 0.07110590487718582, "learning_rate": 4.011497787155938e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1456 }, { "completion_length": 172.875, "epoch": 0.36425, "grad_norm": 0.7150065898895264, "kl": 0.09918969124555588, "learning_rate": 4.009759453133322e-06, "loss": 0.004, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.875, "step": 1457 }, { "completion_length": 172.5, "epoch": 0.3645, "grad_norm": 1.6500592231750488, "kl": 0.11502061039209366, "learning_rate": 4.008019969363206e-06, "loss": 0.0046, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.875, "step": 1458 }, { "completion_length": 185.625, "epoch": 0.36475, "grad_norm": 0.7705068588256836, "kl": 0.10530376434326172, "learning_rate": 4.006279337170283e-06, "loss": 0.0042, "reward": 1.5325000286102295, "reward_std": 0.7362210750579834, "rewards/_accuracy_reward": 0.6575000286102295, "rewards/_format_reward": 0.875, "step": 1459 }, { "completion_length": 140.0, "epoch": 0.365, "grad_norm": 2.920241117477417, "kl": 0.1508193165063858, "learning_rate": 4.0045375578801216e-06, "loss": 0.006, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1460 }, { "completion_length": 148.25, "epoch": 0.36525, "grad_norm": 2.5213630199432373, "kl": 0.07739417254924774, "learning_rate": 4.002794632819159e-06, "loss": 0.0031, "reward": 1.274999976158142, "reward_std": 0.6974443197250366, "rewards/_accuracy_reward": 0.3999999761581421, "rewards/_format_reward": 0.875, "step": 1461 }, { "completion_length": 155.0, "epoch": 0.3655, "grad_norm": 2.9162180423736572, "kl": 0.13509155809879303, "learning_rate": 4.001050563314711e-06, "loss": 0.0054, "reward": 1.7825000286102295, "reward_std": 0.40780770778656006, "rewards/_accuracy_reward": 0.7825000286102295, "rewards/_format_reward": 1.0, "step": 1462 }, { "completion_length": 136.25, "epoch": 0.36575, "grad_norm": 0.4022398591041565, "kl": 0.10202545672655106, "learning_rate": 3.999305350694961e-06, "loss": 0.0041, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1463 }, { "completion_length": 137.75, "epoch": 0.366, "grad_norm": 0.6191596388816833, "kl": 0.11961022019386292, "learning_rate": 3.997558996288965e-06, "loss": 0.0048, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1464 }, { "completion_length": 188.375, "epoch": 0.36625, "grad_norm": 0.6211814284324646, "kl": 0.09365051239728928, "learning_rate": 3.995811501426648e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1465 }, { "completion_length": 181.5, "epoch": 0.3665, "grad_norm": 0.8067208528518677, "kl": 0.11959419399499893, "learning_rate": 3.994062867438803e-06, "loss": 0.0048, "reward": 1.662500023841858, "reward_std": 0.6604922413825989, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 0.875, "step": 1466 }, { "completion_length": 181.5, "epoch": 0.36675, "grad_norm": 0.5875232219696045, "kl": 0.07762034982442856, "learning_rate": 3.992313095657091e-06, "loss": 0.0031, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1467 }, { "completion_length": 160.25, "epoch": 0.367, "grad_norm": 0.6259009838104248, "kl": 0.08514675498008728, "learning_rate": 3.9905621874140396e-06, "loss": 0.0034, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1468 }, { "completion_length": 92.875, "epoch": 0.36725, "grad_norm": 0.023967457935214043, "kl": 0.02864246629178524, "learning_rate": 3.988810144043041e-06, "loss": 0.0011, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1469 }, { "completion_length": 175.875, "epoch": 0.3675, "grad_norm": 0.8310546278953552, "kl": 0.0942971259355545, "learning_rate": 3.987056966878354e-06, "loss": 0.0038, "reward": 1.6387500762939453, "reward_std": 0.49872517585754395, "rewards/_accuracy_reward": 0.6387499570846558, "rewards/_format_reward": 1.0, "step": 1470 }, { "completion_length": 161.75, "epoch": 0.36775, "grad_norm": 0.03448256105184555, "kl": 0.05922067165374756, "learning_rate": 3.985302657255097e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1471 }, { "completion_length": 141.75, "epoch": 0.368, "grad_norm": 0.02368452027440071, "kl": 0.055194176733493805, "learning_rate": 3.983547216509254e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1472 }, { "completion_length": 120.625, "epoch": 0.36825, "grad_norm": 0.04141707718372345, "kl": 0.04475046694278717, "learning_rate": 3.98179064597767e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1473 }, { "completion_length": 155.5, "epoch": 0.3685, "grad_norm": 0.587161123752594, "kl": 0.056352969259023666, "learning_rate": 3.9800329469980495e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1474 }, { "completion_length": 162.125, "epoch": 0.36875, "grad_norm": 0.6867335438728333, "kl": 0.06760282069444656, "learning_rate": 3.978274120908957e-06, "loss": 0.0027, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1475 }, { "completion_length": 119.375, "epoch": 0.369, "grad_norm": 0.8442899584770203, "kl": 0.07957350462675095, "learning_rate": 3.976514169049814e-06, "loss": 0.0032, "reward": 1.658750057220459, "reward_std": 0.47675803303718567, "rewards/_accuracy_reward": 0.6587499976158142, "rewards/_format_reward": 1.0, "step": 1476 }, { "completion_length": 120.875, "epoch": 0.36925, "grad_norm": 0.03868165612220764, "kl": 0.05161227658390999, "learning_rate": 3.974753092760901e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1477 }, { "completion_length": 141.375, "epoch": 0.3695, "grad_norm": 0.7526807188987732, "kl": 0.058717839419841766, "learning_rate": 3.972990893383356e-06, "loss": 0.0023, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1478 }, { "completion_length": 106.875, "epoch": 0.36975, "grad_norm": 0.7434094548225403, "kl": 0.12921760976314545, "learning_rate": 3.971227572259167e-06, "loss": 0.0052, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1479 }, { "completion_length": 175.25, "epoch": 0.37, "grad_norm": 0.6596071124076843, "kl": 0.08641275018453598, "learning_rate": 3.969463130731183e-06, "loss": 0.0035, "reward": 1.15625, "reward_std": 0.6343936920166016, "rewards/_accuracy_reward": 0.28125, "rewards/_format_reward": 0.875, "step": 1480 }, { "completion_length": 208.0, "epoch": 0.37025, "grad_norm": 0.03975485637784004, "kl": 0.06477142870426178, "learning_rate": 3.9676975701431016e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1481 }, { "completion_length": 132.25, "epoch": 0.3705, "grad_norm": 0.6661024689674377, "kl": 0.058449484407901764, "learning_rate": 3.965930891839473e-06, "loss": 0.0023, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1482 }, { "completion_length": 135.875, "epoch": 0.37075, "grad_norm": 0.8266745805740356, "kl": 0.0733661875128746, "learning_rate": 3.964163097165702e-06, "loss": 0.0029, "reward": 1.0999999046325684, "reward_std": 0.09258202463388443, "rewards/_accuracy_reward": 0.10000000149011612, "rewards/_format_reward": 1.0, "step": 1483 }, { "completion_length": 114.125, "epoch": 0.371, "grad_norm": 0.01836472377181053, "kl": 0.08049993216991425, "learning_rate": 3.96239418746804e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1484 }, { "completion_length": 162.5, "epoch": 0.37125, "grad_norm": 0.5880891680717468, "kl": 0.0393814891576767, "learning_rate": 3.960624164093587e-06, "loss": 0.0016, "reward": 1.787500023841858, "reward_std": 0.39708763360977173, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1485 }, { "completion_length": 148.75, "epoch": 0.3715, "grad_norm": 0.5472244024276733, "kl": 0.039617154747247696, "learning_rate": 3.958853028390294e-06, "loss": 0.0016, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1486 }, { "completion_length": 150.875, "epoch": 0.37175, "grad_norm": 1.396892786026001, "kl": 0.06323693692684174, "learning_rate": 3.957080781706959e-06, "loss": 0.0025, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1487 }, { "completion_length": 188.875, "epoch": 0.372, "grad_norm": 0.034602127969264984, "kl": 0.06623050570487976, "learning_rate": 3.955307425393224e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1488 }, { "completion_length": 111.25, "epoch": 0.37225, "grad_norm": 0.021022455766797066, "kl": 0.08140836656093597, "learning_rate": 3.953532960799577e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1489 }, { "completion_length": 174.5, "epoch": 0.3725, "grad_norm": 0.3498646914958954, "kl": 0.04529913514852524, "learning_rate": 3.951757389277349e-06, "loss": 0.0018, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1490 }, { "completion_length": 172.25, "epoch": 0.37275, "grad_norm": 0.4678153991699219, "kl": 0.04285871610045433, "learning_rate": 3.949980712178718e-06, "loss": 0.0017, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1491 }, { "completion_length": 112.125, "epoch": 0.373, "grad_norm": 0.8213009834289551, "kl": 0.0340723879635334, "learning_rate": 3.948202930856697e-06, "loss": 0.0014, "reward": 1.756250023841858, "reward_std": 0.6894291639328003, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 1492 }, { "completion_length": 178.75, "epoch": 0.37325, "grad_norm": 0.7336386442184448, "kl": 0.05348341166973114, "learning_rate": 3.946424046665147e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1493 }, { "completion_length": 123.625, "epoch": 0.3735, "grad_norm": 0.9568604230880737, "kl": 0.06122196465730667, "learning_rate": 3.944644060958764e-06, "loss": 0.0024, "reward": 1.5625, "reward_std": 0.7165144085884094, "rewards/_accuracy_reward": 0.6875, "rewards/_format_reward": 0.875, "step": 1494 }, { "completion_length": 191.25, "epoch": 0.37375, "grad_norm": 0.7187319993972778, "kl": 0.03870750963687897, "learning_rate": 3.942862975093085e-06, "loss": 0.0015, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1495 }, { "completion_length": 116.5, "epoch": 0.374, "grad_norm": 0.014674518257379532, "kl": 0.07812533527612686, "learning_rate": 3.941080790424483e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1496 }, { "completion_length": 140.625, "epoch": 0.37425, "grad_norm": 0.6387913823127747, "kl": 0.03343196585774422, "learning_rate": 3.939297508310172e-06, "loss": 0.0013, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1497 }, { "completion_length": 182.125, "epoch": 0.3745, "grad_norm": 0.4836607277393341, "kl": 0.03932333365082741, "learning_rate": 3.9375131301081974e-06, "loss": 0.0016, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1498 }, { "completion_length": 160.75, "epoch": 0.37475, "grad_norm": 0.6728492379188538, "kl": 0.043070126324892044, "learning_rate": 3.935727657177439e-06, "loss": 0.0017, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.875, "step": 1499 }, { "completion_length": 191.75, "epoch": 0.375, "grad_norm": 0.5258508920669556, "kl": 0.03584269806742668, "learning_rate": 3.933941090877615e-06, "loss": 0.0014, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1500 }, { "completion_length": 94.25, "epoch": 0.37525, "grad_norm": 0.016977330669760704, "kl": 0.021302910521626472, "learning_rate": 3.932153432569273e-06, "loss": 0.0009, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1501 }, { "completion_length": 82.375, "epoch": 0.3755, "grad_norm": 0.9392129778862, "kl": 0.0853070318698883, "learning_rate": 3.930364683613791e-06, "loss": 0.0034, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1502 }, { "completion_length": 150.75, "epoch": 0.37575, "grad_norm": 0.03836243599653244, "kl": 0.0422300361096859, "learning_rate": 3.92857484537338e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1503 }, { "completion_length": 190.0, "epoch": 0.376, "grad_norm": 0.571064829826355, "kl": 0.048360809683799744, "learning_rate": 3.92678391921108e-06, "loss": 0.0019, "reward": 1.5187499523162842, "reward_std": 0.5147382020950317, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 0.875, "step": 1504 }, { "completion_length": 112.375, "epoch": 0.37625, "grad_norm": 0.7876760363578796, "kl": 0.04685162380337715, "learning_rate": 3.924991906490758e-06, "loss": 0.0019, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.643750011920929, "rewards/_format_reward": 1.0, "step": 1505 }, { "completion_length": 185.25, "epoch": 0.3765, "grad_norm": 0.451773077249527, "kl": 0.034047823399305344, "learning_rate": 3.923198808577111e-06, "loss": 0.0014, "reward": 1.537500023841858, "reward_std": 0.7322909235954285, "rewards/_accuracy_reward": 0.6625000238418579, "rewards/_format_reward": 0.875, "step": 1506 }, { "completion_length": 115.625, "epoch": 0.37675, "grad_norm": 0.6053922176361084, "kl": 0.058970190584659576, "learning_rate": 3.921404626835661e-06, "loss": 0.0024, "reward": 1.787500023841858, "reward_std": 0.39708760380744934, "rewards/_accuracy_reward": 0.7875000238418579, "rewards/_format_reward": 1.0, "step": 1507 }, { "completion_length": 195.625, "epoch": 0.377, "grad_norm": 0.6031469702720642, "kl": 0.041082918643951416, "learning_rate": 3.9196093626327535e-06, "loss": 0.0016, "reward": 1.0187499523162842, "reward_std": 0.9184371829032898, "rewards/_accuracy_reward": 0.39374998211860657, "rewards/_format_reward": 0.625, "step": 1508 }, { "completion_length": 165.5, "epoch": 0.37725, "grad_norm": 0.4825488328933716, "kl": 0.02711324580013752, "learning_rate": 3.917813017335562e-06, "loss": 0.0011, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1509 }, { "completion_length": 189.75, "epoch": 0.3775, "grad_norm": 0.6947501301765442, "kl": 0.04373620077967644, "learning_rate": 3.916015592312083e-06, "loss": 0.0017, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1510 }, { "completion_length": 156.5, "epoch": 0.37775, "grad_norm": 0.613882303237915, "kl": 0.045684535056352615, "learning_rate": 3.9142170889311305e-06, "loss": 0.0018, "reward": 1.693750023841858, "reward_std": 0.42714792490005493, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 1511 }, { "completion_length": 177.375, "epoch": 0.378, "grad_norm": 0.01667964830994606, "kl": 0.03237161785364151, "learning_rate": 3.912417508562345e-06, "loss": 0.0013, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1512 }, { "completion_length": 198.5, "epoch": 0.37825, "grad_norm": 0.6279807686805725, "kl": 0.05605170503258705, "learning_rate": 3.910616852576186e-06, "loss": 0.0022, "reward": 1.21875, "reward_std": 0.8284828662872314, "rewards/_accuracy_reward": 0.46875, "rewards/_format_reward": 0.75, "step": 1513 }, { "completion_length": 101.625, "epoch": 0.3785, "grad_norm": 0.06471621990203857, "kl": 0.05444112420082092, "learning_rate": 3.908815122343929e-06, "loss": 0.0022, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1514 }, { "completion_length": 142.25, "epoch": 0.37875, "grad_norm": 0.6215490698814392, "kl": 0.04951619729399681, "learning_rate": 3.907012319237672e-06, "loss": 0.002, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1515 }, { "completion_length": 179.875, "epoch": 0.379, "grad_norm": 0.561939001083374, "kl": 0.0519590750336647, "learning_rate": 3.905208444630326e-06, "loss": 0.0021, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1516 }, { "completion_length": 126.125, "epoch": 0.37925, "grad_norm": 0.022524600848555565, "kl": 0.04927676171064377, "learning_rate": 3.903403499895624e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1517 }, { "completion_length": 147.875, "epoch": 0.3795, "grad_norm": 0.5980425477027893, "kl": 0.03771920129656792, "learning_rate": 3.901597486408105e-06, "loss": 0.0015, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1518 }, { "completion_length": 130.625, "epoch": 0.37975, "grad_norm": 0.9228925704956055, "kl": 0.04406380653381348, "learning_rate": 3.899790405543129e-06, "loss": 0.0018, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1519 }, { "completion_length": 128.625, "epoch": 0.38, "grad_norm": 0.02237652614712715, "kl": 0.05214867368340492, "learning_rate": 3.897982258676867e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1520 }, { "completion_length": 152.375, "epoch": 0.38025, "grad_norm": 0.6844654679298401, "kl": 0.05311375483870506, "learning_rate": 3.896173047186302e-06, "loss": 0.0021, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.2874999940395355, "rewards/_format_reward": 1.0, "step": 1521 }, { "completion_length": 188.75, "epoch": 0.3805, "grad_norm": 0.5941595435142517, "kl": 0.06788572669029236, "learning_rate": 3.894362772449226e-06, "loss": 0.0027, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1522 }, { "completion_length": 142.875, "epoch": 0.38075, "grad_norm": 0.7482618689537048, "kl": 0.05281698703765869, "learning_rate": 3.892551435844242e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1523 }, { "completion_length": 122.625, "epoch": 0.381, "grad_norm": 0.6537103652954102, "kl": 0.03658046945929527, "learning_rate": 3.890739038750763e-06, "loss": 0.0015, "reward": 1.5199999809265137, "reward_std": 0.513308584690094, "rewards/_accuracy_reward": 0.5199999809265137, "rewards/_format_reward": 1.0, "step": 1524 }, { "completion_length": 94.625, "epoch": 0.38125, "grad_norm": 0.024736473336815834, "kl": 0.029560457915067673, "learning_rate": 3.888925582549006e-06, "loss": 0.0012, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1525 }, { "completion_length": 164.75, "epoch": 0.3815, "grad_norm": 0.03351657837629318, "kl": 0.06395187973976135, "learning_rate": 3.887111068619999e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1526 }, { "completion_length": 129.5, "epoch": 0.38175, "grad_norm": 0.6471255421638489, "kl": 0.08158797770738602, "learning_rate": 3.885295498345572e-06, "loss": 0.0033, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1527 }, { "completion_length": 178.25, "epoch": 0.382, "grad_norm": 0.6274938583374023, "kl": 0.04700816795229912, "learning_rate": 3.88347887310836e-06, "loss": 0.0019, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1528 }, { "completion_length": 132.125, "epoch": 0.38225, "grad_norm": 0.7165562510490417, "kl": 0.058674897998571396, "learning_rate": 3.881661194291805e-06, "loss": 0.0023, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1529 }, { "completion_length": 162.25, "epoch": 0.3825, "grad_norm": 0.027459675446152687, "kl": 0.06130118668079376, "learning_rate": 3.879842463280146e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1530 }, { "completion_length": 150.375, "epoch": 0.38275, "grad_norm": 0.031655825674533844, "kl": 0.05282329395413399, "learning_rate": 3.8780226814584265e-06, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1531 }, { "completion_length": 187.75, "epoch": 0.383, "grad_norm": 0.5729331374168396, "kl": 0.06790616363286972, "learning_rate": 3.876201850212489e-06, "loss": 0.0027, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1532 }, { "completion_length": 165.125, "epoch": 0.38325, "grad_norm": 0.02087417244911194, "kl": 0.038140442222356796, "learning_rate": 3.874379970928977e-06, "loss": 0.0015, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1533 }, { "completion_length": 182.5, "epoch": 0.3835, "grad_norm": 0.5951360464096069, "kl": 0.048477813601493835, "learning_rate": 3.87255704499533e-06, "loss": 0.0019, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1534 }, { "completion_length": 123.5, "epoch": 0.38375, "grad_norm": 0.7089921832084656, "kl": 0.04717721790075302, "learning_rate": 3.870733073799785e-06, "loss": 0.0019, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1535 }, { "completion_length": 182.75, "epoch": 0.384, "grad_norm": 0.029666420072317123, "kl": 0.06745561957359314, "learning_rate": 3.868908058731376e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1536 }, { "completion_length": 172.25, "epoch": 0.38425, "grad_norm": 0.719944953918457, "kl": 0.05279861390590668, "learning_rate": 3.867082001179932e-06, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1537 }, { "completion_length": 162.25, "epoch": 0.3845, "grad_norm": 0.023668771609663963, "kl": 0.047101955860853195, "learning_rate": 3.865254902536073e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1538 }, { "completion_length": 154.125, "epoch": 0.38475, "grad_norm": 0.5054484009742737, "kl": 0.05012737214565277, "learning_rate": 3.863426764191216e-06, "loss": 0.002, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1539 }, { "completion_length": 187.5, "epoch": 0.385, "grad_norm": 0.5208625793457031, "kl": 0.04712379723787308, "learning_rate": 3.861597587537568e-06, "loss": 0.0019, "reward": 1.2687499523162842, "reward_std": 0.6999680995941162, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.75, "step": 1540 }, { "completion_length": 117.5, "epoch": 0.38525, "grad_norm": 0.6336519718170166, "kl": 0.07700920104980469, "learning_rate": 3.8597673739681265e-06, "loss": 0.0031, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/_accuracy_reward": 0.8125, "rewards/_format_reward": 1.0, "step": 1541 }, { "completion_length": 190.375, "epoch": 0.3855, "grad_norm": 0.5653288960456848, "kl": 0.05022215470671654, "learning_rate": 3.857936124876677e-06, "loss": 0.002, "reward": 1.3937499523162842, "reward_std": 0.7336004972457886, "rewards/_accuracy_reward": 0.5187499523162842, "rewards/_format_reward": 0.875, "step": 1542 }, { "completion_length": 174.25, "epoch": 0.38575, "grad_norm": 0.7070077061653137, "kl": 0.056165359914302826, "learning_rate": 3.856103841657797e-06, "loss": 0.0022, "reward": 1.756250023841858, "reward_std": 0.45153507590293884, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 0.875, "step": 1543 }, { "completion_length": 190.625, "epoch": 0.386, "grad_norm": 0.03828784078359604, "kl": 0.0686645433306694, "learning_rate": 3.85427052570685e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1544 }, { "completion_length": 167.0, "epoch": 0.38625, "grad_norm": 0.5596043467521667, "kl": 0.052377849817276, "learning_rate": 3.8524361784199855e-06, "loss": 0.0021, "reward": 1.6262500286102295, "reward_std": 0.7428312301635742, "rewards/_accuracy_reward": 0.7512500286102295, "rewards/_format_reward": 0.875, "step": 1545 }, { "completion_length": 158.0, "epoch": 0.3865, "grad_norm": 0.03330124169588089, "kl": 0.03650851547718048, "learning_rate": 3.850600801194138e-06, "loss": 0.0015, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1546 }, { "completion_length": 128.5, "epoch": 0.38675, "grad_norm": 0.5438423752784729, "kl": 0.03633885830640793, "learning_rate": 3.8487643954270274e-06, "loss": 0.0015, "reward": 1.8762500286102295, "reward_std": 0.35001784563064575, "rewards/_accuracy_reward": 0.8762500286102295, "rewards/_format_reward": 1.0, "step": 1547 }, { "completion_length": 149.75, "epoch": 0.387, "grad_norm": 0.6018542647361755, "kl": 0.05037950351834297, "learning_rate": 3.846926962517158e-06, "loss": 0.002, "reward": 1.6437499523162842, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.6437499523162842, "rewards/_format_reward": 1.0, "step": 1548 }, { "completion_length": 130.875, "epoch": 0.38725, "grad_norm": 0.7819284796714783, "kl": 0.059603843837976456, "learning_rate": 3.845088503863813e-06, "loss": 0.0024, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1549 }, { "completion_length": 175.125, "epoch": 0.3875, "grad_norm": 0.5813125371932983, "kl": 0.06442350149154663, "learning_rate": 3.8432490208670605e-06, "loss": 0.0026, "reward": 1.0374999046325684, "reward_std": 0.5350233912467957, "rewards/_accuracy_reward": 0.16249999403953552, "rewards/_format_reward": 0.875, "step": 1550 }, { "completion_length": 182.25, "epoch": 0.38775, "grad_norm": 0.6130213737487793, "kl": 0.06105250120162964, "learning_rate": 3.8414085149277445e-06, "loss": 0.0024, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/_accuracy_reward": 0.65625, "rewards/_format_reward": 0.75, "step": 1551 }, { "completion_length": 138.875, "epoch": 0.388, "grad_norm": 0.0351678803563118, "kl": 0.05961094796657562, "learning_rate": 3.839566987447492e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1552 }, { "completion_length": 178.0, "epoch": 0.38825, "grad_norm": 0.5075997114181519, "kl": 0.05547872185707092, "learning_rate": 3.8377244398287065e-06, "loss": 0.0022, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1553 }, { "completion_length": 123.625, "epoch": 0.3885, "grad_norm": 0.6791297793388367, "kl": 0.0307548139244318, "learning_rate": 3.835880873474567e-06, "loss": 0.0012, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1554 }, { "completion_length": 159.5, "epoch": 0.38875, "grad_norm": 0.021484725177288055, "kl": 0.04366430640220642, "learning_rate": 3.83403628978903e-06, "loss": 0.0017, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1555 }, { "completion_length": 148.875, "epoch": 0.389, "grad_norm": 0.501158595085144, "kl": 0.06199439615011215, "learning_rate": 3.832190690176825e-06, "loss": 0.0025, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1556 }, { "completion_length": 142.75, "epoch": 0.38925, "grad_norm": 0.5822856426239014, "kl": 0.04813998565077782, "learning_rate": 3.830344076043459e-06, "loss": 0.0019, "reward": 1.7574999332427979, "reward_std": 0.4491499960422516, "rewards/_accuracy_reward": 0.7575000524520874, "rewards/_format_reward": 1.0, "step": 1557 }, { "completion_length": 178.0, "epoch": 0.3895, "grad_norm": 0.517787516117096, "kl": 0.07786907255649567, "learning_rate": 3.828496448795208e-06, "loss": 0.0031, "reward": 1.65625, "reward_std": 0.7188470363616943, "rewards/_accuracy_reward": 0.78125, "rewards/_format_reward": 0.875, "step": 1558 }, { "completion_length": 167.375, "epoch": 0.38975, "grad_norm": 0.5909892320632935, "kl": 0.05643405765295029, "learning_rate": 3.826647809839119e-06, "loss": 0.0023, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1559 }, { "completion_length": 133.125, "epoch": 0.39, "grad_norm": 0.6651085019111633, "kl": 0.05972522869706154, "learning_rate": 3.824798160583012e-06, "loss": 0.0024, "reward": 1.40625, "reward_std": 0.49167174100875854, "rewards/_accuracy_reward": 0.4062499701976776, "rewards/_format_reward": 1.0, "step": 1560 }, { "completion_length": 102.75, "epoch": 0.39025, "grad_norm": 0.022245794534683228, "kl": 0.039669353514909744, "learning_rate": 3.822947502435477e-06, "loss": 0.0016, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1561 }, { "completion_length": 186.75, "epoch": 0.3905, "grad_norm": 0.6299328207969666, "kl": 0.07042766362428665, "learning_rate": 3.821095836805868e-06, "loss": 0.0028, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.637499988079071, "rewards/_format_reward": 0.875, "step": 1562 }, { "completion_length": 171.5, "epoch": 0.39075, "grad_norm": 0.026600031182169914, "kl": 0.05926269665360451, "learning_rate": 3.819243165104311e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1563 }, { "completion_length": 126.625, "epoch": 0.391, "grad_norm": 0.6196228861808777, "kl": 0.04901750385761261, "learning_rate": 3.817389488741694e-06, "loss": 0.002, "reward": 1.631250023841858, "reward_std": 0.738210916519165, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.875, "step": 1564 }, { "completion_length": 133.25, "epoch": 0.39125, "grad_norm": 0.6434496641159058, "kl": 0.0704207792878151, "learning_rate": 3.815534809129674e-06, "loss": 0.0028, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1565 }, { "completion_length": 95.375, "epoch": 0.3915, "grad_norm": 0.6863855719566345, "kl": 0.07160824537277222, "learning_rate": 3.8136791276806695e-06, "loss": 0.0029, "reward": 1.568750023841858, "reward_std": 0.4689939618110657, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 0.875, "step": 1566 }, { "completion_length": 97.5, "epoch": 0.39175, "grad_norm": 0.8386607766151428, "kl": 0.050130948424339294, "learning_rate": 3.8118224458078633e-06, "loss": 0.002, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/_accuracy_reward": 0.71875, "rewards/_format_reward": 1.0, "step": 1567 }, { "completion_length": 185.625, "epoch": 0.392, "grad_norm": 0.7192432284355164, "kl": 0.07506588846445084, "learning_rate": 3.8099647649251984e-06, "loss": 0.003, "reward": 1.381250023841858, "reward_std": 0.9133679866790771, "rewards/_accuracy_reward": 0.6312500238418579, "rewards/_format_reward": 0.75, "step": 1568 }, { "completion_length": 197.25, "epoch": 0.39225, "grad_norm": 0.5353832244873047, "kl": 0.0634760856628418, "learning_rate": 3.8081060864473794e-06, "loss": 0.0025, "reward": 1.1387499570846558, "reward_std": 0.8324737548828125, "rewards/_accuracy_reward": 0.38874998688697815, "rewards/_format_reward": 0.75, "step": 1569 }, { "completion_length": 117.125, "epoch": 0.3925, "grad_norm": 0.019449777901172638, "kl": 0.057256847620010376, "learning_rate": 3.806246411789872e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1570 }, { "completion_length": 195.25, "epoch": 0.39275, "grad_norm": 0.6402668356895447, "kl": 0.0625515803694725, "learning_rate": 3.8043857423688995e-06, "loss": 0.0025, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/_accuracy_reward": 0.875, "rewards/_format_reward": 0.875, "step": 1571 }, { "completion_length": 119.75, "epoch": 0.393, "grad_norm": 0.6904266476631165, "kl": 0.04785230755805969, "learning_rate": 3.802524079601442e-06, "loss": 0.0019, "reward": 1.693750023841858, "reward_std": 0.4271479547023773, "rewards/_accuracy_reward": 0.6937500238418579, "rewards/_format_reward": 1.0, "step": 1572 }, { "completion_length": 130.75, "epoch": 0.39325, "grad_norm": 0.5492718815803528, "kl": 0.039260704070329666, "learning_rate": 3.8006614249052353e-06, "loss": 0.0016, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1573 }, { "completion_length": 161.25, "epoch": 0.3935, "grad_norm": 0.030837608501315117, "kl": 0.07404318451881409, "learning_rate": 3.798797779698774e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1574 }, { "completion_length": 123.625, "epoch": 0.39375, "grad_norm": 0.02218150906264782, "kl": 0.07550845295190811, "learning_rate": 3.796933145401304e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1575 }, { "completion_length": 94.0, "epoch": 0.394, "grad_norm": 0.9086791276931763, "kl": 0.055962108075618744, "learning_rate": 3.795067523432826e-06, "loss": 0.0022, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1576 }, { "completion_length": 158.125, "epoch": 0.39425, "grad_norm": 0.7055941224098206, "kl": 0.05105192959308624, "learning_rate": 3.7932009152140926e-06, "loss": 0.002, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1577 }, { "completion_length": 153.875, "epoch": 0.3945, "grad_norm": 0.6992962956428528, "kl": 0.06621918827295303, "learning_rate": 3.791333322166605e-06, "loss": 0.0026, "reward": 1.5125000476837158, "reward_std": 0.7467787861824036, "rewards/_accuracy_reward": 0.6375000476837158, "rewards/_format_reward": 0.875, "step": 1578 }, { "completion_length": 111.875, "epoch": 0.39475, "grad_norm": 0.025215838104486465, "kl": 0.04421816021203995, "learning_rate": 3.7894647457126188e-06, "loss": 0.0018, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1579 }, { "completion_length": 139.0, "epoch": 0.395, "grad_norm": 0.020408930256962776, "kl": 0.04933981969952583, "learning_rate": 3.787595187275136e-06, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1580 }, { "completion_length": 165.5, "epoch": 0.39525, "grad_norm": 0.030607668682932854, "kl": 0.060726772993803024, "learning_rate": 3.7857246482779052e-06, "loss": 0.0024, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1581 }, { "completion_length": 150.875, "epoch": 0.3955, "grad_norm": 0.6549018621444702, "kl": 0.0552009716629982, "learning_rate": 3.7838531301454257e-06, "loss": 0.0022, "reward": 1.7625000476837158, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 1.0, "step": 1582 }, { "completion_length": 168.625, "epoch": 0.39575, "grad_norm": 0.7822985649108887, "kl": 0.07485631853342056, "learning_rate": 3.7819806343029373e-06, "loss": 0.003, "reward": 1.881250023841858, "reward_std": 0.3358757197856903, "rewards/_accuracy_reward": 0.8812500238418579, "rewards/_format_reward": 1.0, "step": 1583 }, { "completion_length": 128.0, "epoch": 0.396, "grad_norm": 0.709174633026123, "kl": 0.04844846948981285, "learning_rate": 3.780107162176429e-06, "loss": 0.0019, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1584 }, { "completion_length": 137.75, "epoch": 0.39625, "grad_norm": 0.696996808052063, "kl": 0.04638068005442619, "learning_rate": 3.77823271519263e-06, "loss": 0.0019, "reward": 1.6387500762939453, "reward_std": 0.49872517585754395, "rewards/_accuracy_reward": 0.6387499570846558, "rewards/_format_reward": 1.0, "step": 1585 }, { "completion_length": 124.5, "epoch": 0.3965, "grad_norm": 0.02669079601764679, "kl": 0.05704856663942337, "learning_rate": 3.776357294779015e-06, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1586 }, { "completion_length": 163.375, "epoch": 0.39675, "grad_norm": 0.6654564738273621, "kl": 0.0693705603480339, "learning_rate": 3.774480902363795e-06, "loss": 0.0028, "reward": 1.4187500476837158, "reward_std": 0.7235515117645264, "rewards/_accuracy_reward": 0.5437500476837158, "rewards/_format_reward": 0.875, "step": 1587 }, { "completion_length": 143.5, "epoch": 0.397, "grad_norm": 0.5335232615470886, "kl": 0.04081644117832184, "learning_rate": 3.772603539375929e-06, "loss": 0.0016, "reward": 1.6375000476837158, "reward_std": 0.5005354285240173, "rewards/_accuracy_reward": 0.762499988079071, "rewards/_format_reward": 0.875, "step": 1588 }, { "completion_length": 113.625, "epoch": 0.39725, "grad_norm": 0.7147615551948547, "kl": 0.04324035719037056, "learning_rate": 3.770725207245106e-06, "loss": 0.0017, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/_accuracy_reward": 0.90625, "rewards/_format_reward": 1.0, "step": 1589 }, { "completion_length": 154.0, "epoch": 0.3975, "grad_norm": 0.6111875772476196, "kl": 0.0737532302737236, "learning_rate": 3.768845907401761e-06, "loss": 0.003, "reward": 1.2874999046325684, "reward_std": 0.4397645592689514, "rewards/_accuracy_reward": 0.28749996423721313, "rewards/_format_reward": 1.0, "step": 1590 }, { "completion_length": 143.875, "epoch": 0.39775, "grad_norm": 0.6925716996192932, "kl": 0.0703587755560875, "learning_rate": 3.7669656412770605e-06, "loss": 0.0028, "reward": 1.0587499141693115, "reward_std": 0.5271334052085876, "rewards/_accuracy_reward": 0.1837500035762787, "rewards/_format_reward": 0.875, "step": 1591 }, { "completion_length": 157.875, "epoch": 0.398, "grad_norm": 0.5909250974655151, "kl": 0.04481671750545502, "learning_rate": 3.7650844103029093e-06, "loss": 0.0018, "reward": 1.5199999809265137, "reward_std": 0.5133086442947388, "rewards/_accuracy_reward": 0.5199999809265137, "rewards/_format_reward": 1.0, "step": 1592 }, { "completion_length": 173.75, "epoch": 0.39825, "grad_norm": 0.041420962661504745, "kl": 0.07704256474971771, "learning_rate": 3.763202215911948e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1593 }, { "completion_length": 123.0, "epoch": 0.3985, "grad_norm": 0.033740244805812836, "kl": 0.046596985310316086, "learning_rate": 3.7613190595375484e-06, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1594 }, { "completion_length": 173.5, "epoch": 0.39875, "grad_norm": 0.6274866461753845, "kl": 0.06995417177677155, "learning_rate": 3.759434942613816e-06, "loss": 0.0028, "reward": 1.524999976158142, "reward_std": 0.5077964067459106, "rewards/_accuracy_reward": 0.5249999761581421, "rewards/_format_reward": 1.0, "step": 1595 }, { "completion_length": 133.25, "epoch": 0.399, "grad_norm": 0.02433399297297001, "kl": 0.06196637451648712, "learning_rate": 3.7575498665755884e-06, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 1.0, "step": 1596 }, { "completion_length": 171.75, "epoch": 0.39925, "grad_norm": 0.5644596815109253, "kl": 0.07158312946557999, "learning_rate": 3.7556638328584314e-06, "loss": 0.0029, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/_accuracy_reward": 0.75, "rewards/_format_reward": 0.75, "step": 1597 }, { "completion_length": 140.75, "epoch": 0.3995, "grad_norm": 0.6017252802848816, "kl": 0.058312345296144485, "learning_rate": 3.753776842898644e-06, "loss": 0.0023, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/_accuracy_reward": 1.0, "rewards/_format_reward": 0.875, "step": 1598 }, { "completion_length": 162.125, "epoch": 0.39975, "grad_norm": 0.6574504375457764, "kl": 0.06590811908245087, "learning_rate": 3.751888898133249e-06, "loss": 0.0026, "reward": 1.3125, "reward_std": 0.4299086630344391, "rewards/_accuracy_reward": 0.3124999701976776, "rewards/_format_reward": 1.0, "step": 1599 }, { "completion_length": 179.0, "epoch": 0.4, "grad_norm": 0.533219575881958, "kl": 0.07962776720523834, "learning_rate": 3.7500000000000005e-06, "loss": 0.0032, "reward": 1.506250023841858, "reward_std": 0.7513975501060486, "rewards/_accuracy_reward": 0.7562500238418579, "rewards/_format_reward": 0.75, "step": 1600 } ], "logging_steps": 1, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }