diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4, + "eval_steps": 500, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 254.375, + "epoch": 0.00025, + "grad_norm": 0.37493768334388733, + "kl": 0.0, + "learning_rate": 1.2500000000000001e-08, + "loss": -0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 1 + }, + { + "completion_length": 220.875, + "epoch": 0.0005, + "grad_norm": 0.37041813135147095, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": -0.0, + "reward": 0.7687499523162842, + "reward_std": 0.7151111364364624, + "rewards/_accuracy_reward": 0.26874998211860657, + "rewards/_format_reward": 0.5, + "step": 2 + }, + { + "completion_length": 225.5, + "epoch": 0.00075, + "grad_norm": 0.37694671750068665, + "kl": 0.0006723726983182132, + "learning_rate": 3.7500000000000005e-08, + "loss": 0.0, + "reward": 0.7875000238418579, + "reward_std": 0.9026746153831482, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 0.5, + "step": 3 + }, + { + "completion_length": 238.875, + "epoch": 0.001, + "grad_norm": 0.34139618277549744, + "kl": 0.0005685070063918829, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 0.375, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.125, + "step": 4 + }, + { + "completion_length": 179.75, + "epoch": 0.00125, + "grad_norm": 0.3876967430114746, + "kl": 0.0005563868908211589, + "learning_rate": 6.250000000000001e-08, + "loss": 0.0, + "reward": 1.125, + "reward_std": 0.9910312294960022, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.5, + "step": 5 + }, + { + "completion_length": 210.5, + "epoch": 0.0015, + "grad_norm": 0.32642900943756104, + "kl": 0.0006381691200658679, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 6 + }, + { + "completion_length": 114.875, + "epoch": 0.00175, + "grad_norm": 0.63201904296875, + "kl": 0.00071949657285586, + "learning_rate": 8.750000000000001e-08, + "loss": 0.0, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 7 + }, + { + "completion_length": 233.375, + "epoch": 0.002, + "grad_norm": 0.6539371013641357, + "kl": 0.0006772859487682581, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 0.13749998807907104, + "reward_std": 0.3691205382347107, + "rewards/_accuracy_reward": 0.012500000186264515, + "rewards/_format_reward": 0.125, + "step": 8 + }, + { + "completion_length": 201.875, + "epoch": 0.00225, + "grad_norm": 0.45297423005104065, + "kl": 0.0006543596973642707, + "learning_rate": 1.1250000000000001e-07, + "loss": 0.0, + "reward": 0.875, + "reward_std": 0.9910312294960022, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.375, + "step": 9 + }, + { + "completion_length": 194.5, + "epoch": 0.0025, + "grad_norm": 0.4227246046066284, + "kl": 0.000667984364554286, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 0.71875, + "reward_std": 0.8066409826278687, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.5, + "step": 10 + }, + { + "completion_length": 243.0, + "epoch": 0.00275, + "grad_norm": 0.0012034112587571144, + "kl": 0.0005753615405410528, + "learning_rate": 1.375e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.0, + "rewards/_format_reward": 0.0, + "step": 11 + }, + { + "completion_length": 209.625, + "epoch": 0.003, + "grad_norm": 0.3236827254295349, + "kl": 0.0004996673669666052, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.574999988079071, + "reward_std": 0.619331419467926, + "rewards/_accuracy_reward": 0.07500000298023224, + "rewards/_format_reward": 0.5, + "step": 12 + }, + { + "completion_length": 226.375, + "epoch": 0.00325, + "grad_norm": 0.3909885287284851, + "kl": 0.0006868684431537986, + "learning_rate": 1.625e-07, + "loss": 0.0, + "reward": 0.34375, + "reward_std": 0.4988826811313629, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.125, + "step": 13 + }, + { + "completion_length": 232.0, + "epoch": 0.0035, + "grad_norm": 0.2995592951774597, + "kl": 0.0005955615197308362, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.810092568397522, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.375, + "step": 14 + }, + { + "completion_length": 172.625, + "epoch": 0.00375, + "grad_norm": 0.5309815406799316, + "kl": 0.0006232442683540285, + "learning_rate": 1.875e-07, + "loss": 0.0, + "reward": 0.6312500238418579, + "reward_std": 0.9192144870758057, + "rewards/_accuracy_reward": 0.2562499940395355, + "rewards/_format_reward": 0.375, + "step": 15 + }, + { + "completion_length": 123.375, + "epoch": 0.004, + "grad_norm": 0.3811752200126648, + "kl": 0.000560120097361505, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.96875, + "reward_std": 0.5250425338745117, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.75, + "step": 16 + }, + { + "completion_length": 226.125, + "epoch": 0.00425, + "grad_norm": 0.3893718123435974, + "kl": 0.0008009783923625946, + "learning_rate": 2.1250000000000003e-07, + "loss": 0.0, + "reward": 0.4437499940395355, + "reward_std": 0.6155354380607605, + "rewards/_accuracy_reward": 0.06875000149011612, + "rewards/_format_reward": 0.375, + "step": 17 + }, + { + "completion_length": 204.25, + "epoch": 0.0045, + "grad_norm": 0.3759849965572357, + "kl": 0.0006915747653692961, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.7647361755371094, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.25, + "step": 18 + }, + { + "completion_length": 231.0, + "epoch": 0.00475, + "grad_norm": 0.34655511379241943, + "kl": 0.000734607398044318, + "learning_rate": 2.3750000000000003e-07, + "loss": 0.0, + "reward": 0.53125, + "reward_std": 0.9106267690658569, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.25, + "step": 19 + }, + { + "completion_length": 244.25, + "epoch": 0.005, + "grad_norm": 0.32063984870910645, + "kl": 0.0005592820816673338, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 20 + }, + { + "completion_length": 237.625, + "epoch": 0.00525, + "grad_norm": 0.3625517189502716, + "kl": 0.0005329761188477278, + "learning_rate": 2.6250000000000003e-07, + "loss": 0.0, + "reward": 0.1875, + "reward_std": 0.4381372928619385, + "rewards/_accuracy_reward": 0.0625, + "rewards/_format_reward": 0.125, + "step": 21 + }, + { + "completion_length": 240.5, + "epoch": 0.0055, + "grad_norm": 0.39223137497901917, + "kl": 0.0006619459018111229, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 0.375, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.125, + "step": 22 + }, + { + "completion_length": 243.5, + "epoch": 0.00575, + "grad_norm": 0.4175410866737366, + "kl": 0.0005919833201915026, + "learning_rate": 2.8750000000000005e-07, + "loss": 0.0, + "reward": 0.13124999403953552, + "reward_std": 0.3712310194969177, + "rewards/_accuracy_reward": 0.0062500000931322575, + "rewards/_format_reward": 0.125, + "step": 23 + }, + { + "completion_length": 221.625, + "epoch": 0.006, + "grad_norm": 0.3331623673439026, + "kl": 0.0006004280294291675, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 0.13124999403953552, + "reward_std": 0.3712310194969177, + "rewards/_accuracy_reward": 0.0062500000931322575, + "rewards/_format_reward": 0.125, + "step": 24 + }, + { + "completion_length": 235.0, + "epoch": 0.00625, + "grad_norm": 0.3354165852069855, + "kl": 0.0007247485918924212, + "learning_rate": 3.125e-07, + "loss": 0.0, + "reward": 0.5187499523162842, + "reward_std": 0.554808497428894, + "rewards/_accuracy_reward": 0.14374999701976776, + "rewards/_format_reward": 0.375, + "step": 25 + }, + { + "completion_length": 183.625, + "epoch": 0.0065, + "grad_norm": 0.36789894104003906, + "kl": 0.0005746211390942335, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 0.65625, + "reward_std": 0.6399986147880554, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.5, + "step": 26 + }, + { + "completion_length": 249.75, + "epoch": 0.00675, + "grad_norm": 0.34738096594810486, + "kl": 0.0005474050994962454, + "learning_rate": 3.3750000000000005e-07, + "loss": 0.0, + "reward": 0.3125, + "reward_std": 0.5786375403404236, + "rewards/_accuracy_reward": 0.0625, + "rewards/_format_reward": 0.25, + "step": 27 + }, + { + "completion_length": 225.125, + "epoch": 0.007, + "grad_norm": 0.40637290477752686, + "kl": 0.0006180583732202649, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 28 + }, + { + "completion_length": 220.375, + "epoch": 0.00725, + "grad_norm": 0.3404392600059509, + "kl": 0.0006671351147815585, + "learning_rate": 3.625e-07, + "loss": 0.0, + "reward": 0.875, + "reward_std": 0.9910312294960022, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.375, + "step": 29 + }, + { + "completion_length": 217.75, + "epoch": 0.0075, + "grad_norm": 0.3346198797225952, + "kl": 0.0005711165722459555, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.28125, + "reward_std": 0.5250425338745117, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.125, + "step": 30 + }, + { + "completion_length": 203.75, + "epoch": 0.00775, + "grad_norm": 0.3994467854499817, + "kl": 0.0006567926029674709, + "learning_rate": 3.8750000000000005e-07, + "loss": 0.0, + "reward": 0.9375, + "reward_std": 0.8530408143997192, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 0.5, + "step": 31 + }, + { + "completion_length": 245.25, + "epoch": 0.008, + "grad_norm": 0.3885516822338104, + "kl": 0.0007667718455195427, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 0.15625, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.03125, + "rewards/_format_reward": 0.125, + "step": 32 + }, + { + "completion_length": 142.5, + "epoch": 0.00825, + "grad_norm": 0.5926032066345215, + "kl": 0.0007175234495662153, + "learning_rate": 4.125000000000001e-07, + "loss": 0.0, + "reward": 0.78125, + "reward_std": 0.6469364762306213, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.625, + "step": 33 + }, + { + "completion_length": 256.0, + "epoch": 0.0085, + "grad_norm": 0.31638824939727783, + "kl": 0.0005166275659576058, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 34 + }, + { + "completion_length": 135.75, + "epoch": 0.00875, + "grad_norm": 0.48894402384757996, + "kl": 0.0006614525336772203, + "learning_rate": 4.375e-07, + "loss": 0.0, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 35 + }, + { + "completion_length": 199.875, + "epoch": 0.009, + "grad_norm": 0.35523107647895813, + "kl": 0.0007132225437089801, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 1.256250023841858, + "reward_std": 1.0265884399414062, + "rewards/_accuracy_reward": 0.6312500238418579, + "rewards/_format_reward": 0.625, + "step": 36 + }, + { + "completion_length": 231.25, + "epoch": 0.00925, + "grad_norm": 0.35712504386901855, + "kl": 0.0005631354288198054, + "learning_rate": 4.625e-07, + "loss": 0.0, + "reward": 0.34375, + "reward_std": 0.5659615993499756, + "rewards/_accuracy_reward": 0.09375, + "rewards/_format_reward": 0.25, + "step": 37 + }, + { + "completion_length": 128.75, + "epoch": 0.0095, + "grad_norm": 0.48980286717414856, + "kl": 0.0007973555475473404, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 1.15625, + "reward_std": 0.9904679656028748, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 0.625, + "step": 38 + }, + { + "completion_length": 119.875, + "epoch": 0.00975, + "grad_norm": 0.4735913872718811, + "kl": 0.0006368064787238836, + "learning_rate": 4.875000000000001e-07, + "loss": 0.0, + "reward": 0.9375, + "reward_std": 0.5786375403404236, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.75, + "step": 39 + }, + { + "completion_length": 238.25, + "epoch": 0.01, + "grad_norm": 0.0012446728069335222, + "kl": 0.0006722843972966075, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.0, + "rewards/_format_reward": 0.0, + "step": 40 + }, + { + "completion_length": 175.625, + "epoch": 0.01025, + "grad_norm": 0.403473436832428, + "kl": 0.0006806729943491518, + "learning_rate": 5.125e-07, + "loss": 0.0, + "reward": 1.225000023841858, + "reward_std": 0.782395601272583, + "rewards/_accuracy_reward": 0.4749999940395355, + "rewards/_format_reward": 0.75, + "step": 41 + }, + { + "completion_length": 231.875, + "epoch": 0.0105, + "grad_norm": 0.49770158529281616, + "kl": 0.0007452387944795191, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 0.3125, + "reward_std": 0.5786375403404236, + "rewards/_accuracy_reward": 0.0625, + "rewards/_format_reward": 0.25, + "step": 42 + }, + { + "completion_length": 153.375, + "epoch": 0.01075, + "grad_norm": 0.4276140332221985, + "kl": 0.0006472233217209578, + "learning_rate": 5.375e-07, + "loss": 0.0, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 43 + }, + { + "completion_length": 253.125, + "epoch": 0.011, + "grad_norm": 0.3752562701702118, + "kl": 0.0006933521945029497, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 44 + }, + { + "completion_length": 238.0, + "epoch": 0.01125, + "grad_norm": 0.36442264914512634, + "kl": 0.0006443914026021957, + "learning_rate": 5.625e-07, + "loss": 0.0, + "reward": 0.26374998688697815, + "reward_std": 0.4852962791919708, + "rewards/_accuracy_reward": 0.013749999925494194, + "rewards/_format_reward": 0.25, + "step": 45 + }, + { + "completion_length": 245.25, + "epoch": 0.0115, + "grad_norm": 0.3032134473323822, + "kl": 0.0007067061378620565, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 46 + }, + { + "completion_length": 244.0, + "epoch": 0.01175, + "grad_norm": 0.29296252131462097, + "kl": 0.0005294690490700305, + "learning_rate": 5.875e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 47 + }, + { + "completion_length": 217.875, + "epoch": 0.012, + "grad_norm": 0.452722430229187, + "kl": 0.0006610968266613781, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.6312500238418579, + "reward_std": 0.9192146062850952, + "rewards/_accuracy_reward": 0.2562499940395355, + "rewards/_format_reward": 0.375, + "step": 48 + }, + { + "completion_length": 212.75, + "epoch": 0.01225, + "grad_norm": 0.3818938136100769, + "kl": 0.000738343340344727, + "learning_rate": 6.125000000000001e-07, + "loss": 0.0, + "reward": 0.59375, + "reward_std": 0.7898632287979126, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.375, + "step": 49 + }, + { + "completion_length": 253.75, + "epoch": 0.0125, + "grad_norm": 0.3251837491989136, + "kl": 0.0006078524165786803, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 0.13124999403953552, + "reward_std": 0.3712310194969177, + "rewards/_accuracy_reward": 0.0062500000931322575, + "rewards/_format_reward": 0.125, + "step": 50 + }, + { + "completion_length": 243.0, + "epoch": 0.01275, + "grad_norm": 0.5190939903259277, + "kl": 0.0007083836244419217, + "learning_rate": 6.375e-07, + "loss": 0.0, + "reward": 0.375, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.125, + "step": 51 + }, + { + "completion_length": 238.875, + "epoch": 0.013, + "grad_norm": 0.28036755323410034, + "kl": 0.0006517173023894429, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.14374998211860657, + "reward_std": 0.3668762743473053, + "rewards/_accuracy_reward": 0.01875000074505806, + "rewards/_format_reward": 0.125, + "step": 52 + }, + { + "completion_length": 246.75, + "epoch": 0.01325, + "grad_norm": 0.35812488198280334, + "kl": 0.0008360664360225201, + "learning_rate": 6.625000000000001e-07, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.0, + "step": 53 + }, + { + "completion_length": 224.125, + "epoch": 0.0135, + "grad_norm": 0.6414341926574707, + "kl": 0.0006303292466327548, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 0.13124999403953552, + "reward_std": 0.35146379470825195, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.0, + "step": 54 + }, + { + "completion_length": 179.25, + "epoch": 0.01375, + "grad_norm": 0.48622655868530273, + "kl": 0.0005767670809291303, + "learning_rate": 6.875000000000001e-07, + "loss": 0.0, + "reward": 1.0625, + "reward_std": 0.933025062084198, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 0.625, + "step": 55 + }, + { + "completion_length": 249.875, + "epoch": 0.014, + "grad_norm": 0.3496423661708832, + "kl": 0.0007510894211009145, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 56 + }, + { + "completion_length": 203.625, + "epoch": 0.01425, + "grad_norm": 0.4185102880001068, + "kl": 0.0006905872724018991, + "learning_rate": 7.125e-07, + "loss": 0.0, + "reward": 0.4437499940395355, + "reward_std": 0.6155354380607605, + "rewards/_accuracy_reward": 0.06875000149011612, + "rewards/_format_reward": 0.375, + "step": 57 + }, + { + "completion_length": 168.125, + "epoch": 0.0145, + "grad_norm": 0.4493039846420288, + "kl": 0.0008549755439162254, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.0625, + "reward_std": 0.933025062084198, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 0.625, + "step": 58 + }, + { + "completion_length": 191.375, + "epoch": 0.01475, + "grad_norm": 0.47024667263031006, + "kl": 0.0006958736339583993, + "learning_rate": 7.375e-07, + "loss": 0.0, + "reward": 0.9375, + "reward_std": 0.8530408143997192, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 0.5, + "step": 59 + }, + { + "completion_length": 221.875, + "epoch": 0.015, + "grad_norm": 0.38048815727233887, + "kl": 0.0005862182006239891, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 0.39374998211860657, + "reward_std": 0.5434265732765198, + "rewards/_accuracy_reward": 0.01875000074505806, + "rewards/_format_reward": 0.375, + "step": 60 + }, + { + "completion_length": 168.375, + "epoch": 0.01525, + "grad_norm": 0.4688246548175812, + "kl": 0.0007134783663786948, + "learning_rate": 7.625e-07, + "loss": 0.0, + "reward": 0.762499988079071, + "reward_std": 0.5736786127090454, + "rewards/_accuracy_reward": 0.13750000298023224, + "rewards/_format_reward": 0.625, + "step": 61 + }, + { + "completion_length": 223.375, + "epoch": 0.0155, + "grad_norm": 0.42805564403533936, + "kl": 0.0006862103473395109, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 0.15625, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.03125, + "rewards/_format_reward": 0.125, + "step": 62 + }, + { + "completion_length": 134.5, + "epoch": 0.01575, + "grad_norm": 0.3511326014995575, + "kl": 0.0005350976716727018, + "learning_rate": 7.875000000000001e-07, + "loss": 0.0, + "reward": 1.09375, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.875, + "step": 63 + }, + { + "completion_length": 179.5, + "epoch": 0.016, + "grad_norm": 0.3511122763156891, + "kl": 0.0006030978402122855, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.7999999523162842, + "reward_std": 0.7319250702857971, + "rewards/_accuracy_reward": 0.17500001192092896, + "rewards/_format_reward": 0.625, + "step": 64 + }, + { + "completion_length": 186.625, + "epoch": 0.01625, + "grad_norm": 0.40380579233169556, + "kl": 0.0007325903279706836, + "learning_rate": 8.125000000000001e-07, + "loss": 0.0, + "reward": 0.71875, + "reward_std": 0.8066409826278687, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.5, + "step": 65 + }, + { + "completion_length": 243.75, + "epoch": 0.0165, + "grad_norm": 0.32375988364219666, + "kl": 0.0006421853322535753, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 66 + }, + { + "completion_length": 227.625, + "epoch": 0.01675, + "grad_norm": 0.320794939994812, + "kl": 0.000636743672657758, + "learning_rate": 8.375000000000001e-07, + "loss": 0.0, + "reward": 0.8125, + "reward_std": 0.9136856198310852, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.5, + "step": 67 + }, + { + "completion_length": 212.0, + "epoch": 0.017, + "grad_norm": 0.3829336166381836, + "kl": 0.0007118759676814079, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 0.71875, + "reward_std": 0.8066409826278687, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.5, + "step": 68 + }, + { + "completion_length": 242.0, + "epoch": 0.01725, + "grad_norm": 0.3430902361869812, + "kl": 0.0006913796532899141, + "learning_rate": 8.625e-07, + "loss": 0.0, + "reward": 0.03125, + "reward_std": 0.0883883461356163, + "rewards/_accuracy_reward": 0.03125, + "rewards/_format_reward": 0.0, + "step": 69 + }, + { + "completion_length": 238.5, + "epoch": 0.0175, + "grad_norm": 0.3244885504245758, + "kl": 0.0006061598542146385, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 70 + }, + { + "completion_length": 142.625, + "epoch": 0.01775, + "grad_norm": 0.5040633678436279, + "kl": 0.0006145286024548113, + "learning_rate": 8.875000000000001e-07, + "loss": 0.0, + "reward": 0.875, + "reward_std": 0.5175491571426392, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.625, + "step": 71 + }, + { + "completion_length": 256.0, + "epoch": 0.018, + "grad_norm": 0.0008479771786369383, + "kl": 0.00054691091645509, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.0, + "rewards/_format_reward": 0.0, + "step": 72 + }, + { + "completion_length": 203.75, + "epoch": 0.01825, + "grad_norm": 0.441141813993454, + "kl": 0.0006427373737096786, + "learning_rate": 9.125e-07, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.7647361755371094, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.25, + "step": 73 + }, + { + "completion_length": 213.125, + "epoch": 0.0185, + "grad_norm": 0.4041379392147064, + "kl": 0.0007986929267644882, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.2749999761581421, + "reward_std": 0.47883784770965576, + "rewards/_accuracy_reward": 0.02500000037252903, + "rewards/_format_reward": 0.25, + "step": 74 + }, + { + "completion_length": 240.625, + "epoch": 0.01875, + "grad_norm": 0.3502941429615021, + "kl": 0.0006043448811396956, + "learning_rate": 9.375000000000001e-07, + "loss": 0.0, + "reward": 0.40625, + "reward_std": 0.7784771919250488, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.25, + "step": 75 + }, + { + "completion_length": 238.75, + "epoch": 0.019, + "grad_norm": 0.36953458189964294, + "kl": 0.0006860418943688273, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 76 + }, + { + "completion_length": 234.625, + "epoch": 0.01925, + "grad_norm": 0.3768980801105499, + "kl": 0.000608474132604897, + "learning_rate": 9.625e-07, + "loss": 0.0, + "reward": 0.26249998807907104, + "reward_std": 0.48605549335479736, + "rewards/_accuracy_reward": 0.012500000186264515, + "rewards/_format_reward": 0.25, + "step": 77 + }, + { + "completion_length": 252.875, + "epoch": 0.0195, + "grad_norm": 0.3738728165626526, + "kl": 0.0006259999936446548, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 0.03125, + "reward_std": 0.0883883461356163, + "rewards/_accuracy_reward": 0.03125, + "rewards/_format_reward": 0.0, + "step": 78 + }, + { + "completion_length": 186.625, + "epoch": 0.01975, + "grad_norm": 0.3504292666912079, + "kl": 0.0006338249077089131, + "learning_rate": 9.875e-07, + "loss": 0.0, + "reward": 1.2625000476837158, + "reward_std": 0.7024192810058594, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.625, + "step": 79 + }, + { + "completion_length": 193.0, + "epoch": 0.02, + "grad_norm": 0.3745087683200836, + "kl": 0.000602383108343929, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0, + "reward": 0.96875, + "reward_std": 0.8602066040039062, + "rewards/_accuracy_reward": 0.34375, + "rewards/_format_reward": 0.625, + "step": 80 + }, + { + "completion_length": 214.0, + "epoch": 0.02025, + "grad_norm": 0.327633798122406, + "kl": 0.0005732266581617296, + "learning_rate": 1.0125e-06, + "loss": 0.0, + "reward": 0.78125, + "reward_std": 0.900768518447876, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.375, + "step": 81 + }, + { + "completion_length": 217.875, + "epoch": 0.0205, + "grad_norm": 0.43696385622024536, + "kl": 0.000646731466986239, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.512499988079071, + "reward_std": 0.765669584274292, + "rewards/_accuracy_reward": 0.13749998807907104, + "rewards/_format_reward": 0.375, + "step": 82 + }, + { + "completion_length": 189.125, + "epoch": 0.02075, + "grad_norm": 0.44965532422065735, + "kl": 0.0007412461563944817, + "learning_rate": 1.0375e-06, + "loss": 0.0, + "reward": 0.90625, + "reward_std": 0.9994418025016785, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.5, + "step": 83 + }, + { + "completion_length": 182.5, + "epoch": 0.021, + "grad_norm": 0.32483235001564026, + "kl": 0.0006356970407068729, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.375, + "reward_std": 0.6681531071662903, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.875, + "step": 84 + }, + { + "completion_length": 218.375, + "epoch": 0.02125, + "grad_norm": 0.36401814222335815, + "kl": 0.0006372305797412992, + "learning_rate": 1.0625e-06, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.810092568397522, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.375, + "step": 85 + }, + { + "completion_length": 192.875, + "epoch": 0.0215, + "grad_norm": 0.40159401297569275, + "kl": 0.0006454067770391703, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 0.625, + "reward_std": 0.6681531071662903, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.5, + "step": 86 + }, + { + "completion_length": 199.375, + "epoch": 0.02175, + "grad_norm": 0.3774471879005432, + "kl": 0.000684591883327812, + "learning_rate": 1.0875000000000002e-06, + "loss": 0.0, + "reward": 1.0, + "reward_std": 1.0690449476242065, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.5, + "step": 87 + }, + { + "completion_length": 235.25, + "epoch": 0.022, + "grad_norm": 0.4988541901111603, + "kl": 0.0005781830986961722, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 0.13749998807907104, + "reward_std": 0.3691205382347107, + "rewards/_accuracy_reward": 0.012500000186264515, + "rewards/_format_reward": 0.125, + "step": 88 + }, + { + "completion_length": 231.0, + "epoch": 0.02225, + "grad_norm": 0.3462050259113312, + "kl": 0.0006304323324002326, + "learning_rate": 1.1125000000000001e-06, + "loss": 0.0, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 89 + }, + { + "completion_length": 240.375, + "epoch": 0.0225, + "grad_norm": 0.366056352853775, + "kl": 0.000524615403264761, + "learning_rate": 1.125e-06, + "loss": 0.0, + "reward": 0.3125, + "reward_std": 0.5786375403404236, + "rewards/_accuracy_reward": 0.0625, + "rewards/_format_reward": 0.25, + "step": 90 + }, + { + "completion_length": 212.125, + "epoch": 0.02275, + "grad_norm": 0.38316699862480164, + "kl": 0.0006456922856159508, + "learning_rate": 1.1375000000000001e-06, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.810092568397522, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.375, + "step": 91 + }, + { + "completion_length": 173.875, + "epoch": 0.023, + "grad_norm": 0.4823635518550873, + "kl": 0.0008208313374780118, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0, + "reward": 1.2625000476837158, + "reward_std": 0.8826704621315002, + "rewards/_accuracy_reward": 0.512499988079071, + "rewards/_format_reward": 0.75, + "step": 92 + }, + { + "completion_length": 256.0, + "epoch": 0.02325, + "grad_norm": 0.0010727356420829892, + "kl": 0.0005186050548218191, + "learning_rate": 1.1625e-06, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.0, + "rewards/_format_reward": 0.0, + "step": 93 + }, + { + "completion_length": 204.375, + "epoch": 0.0235, + "grad_norm": 0.4184603691101074, + "kl": 0.0008142682490870357, + "learning_rate": 1.175e-06, + "loss": 0.0, + "reward": 1.1875, + "reward_std": 0.831843912601471, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.625, + "step": 94 + }, + { + "completion_length": 196.5, + "epoch": 0.02375, + "grad_norm": 0.465569406747818, + "kl": 0.0008174768299795687, + "learning_rate": 1.1875e-06, + "loss": 0.0, + "reward": 1.0012500286102295, + "reward_std": 0.6813943386077881, + "rewards/_accuracy_reward": 0.2512499988079071, + "rewards/_format_reward": 0.75, + "step": 95 + }, + { + "completion_length": 236.625, + "epoch": 0.024, + "grad_norm": 0.4273647367954254, + "kl": 0.0006561095942743123, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 96 + }, + { + "completion_length": 210.75, + "epoch": 0.02425, + "grad_norm": 0.4174526035785675, + "kl": 0.0006856574909761548, + "learning_rate": 1.2125e-06, + "loss": 0.0, + "reward": 0.65625, + "reward_std": 0.8957987427711487, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.25, + "step": 97 + }, + { + "completion_length": 239.875, + "epoch": 0.0245, + "grad_norm": 0.3910273611545563, + "kl": 0.0006794078508391976, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0, + "reward": 0.40625, + "reward_std": 0.7784771919250488, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.25, + "step": 98 + }, + { + "completion_length": 230.625, + "epoch": 0.02475, + "grad_norm": 0.3605254888534546, + "kl": 0.0006890887161716819, + "learning_rate": 1.2375e-06, + "loss": 0.0, + "reward": 0.2562499940395355, + "reward_std": 0.7047986388206482, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.125, + "step": 99 + }, + { + "completion_length": 232.5, + "epoch": 0.025, + "grad_norm": 0.374227911233902, + "kl": 0.0006080594030208886, + "learning_rate": 1.25e-06, + "loss": 0.0, + "reward": 0.26374998688697815, + "reward_std": 0.47337502241134644, + "rewards/_accuracy_reward": 0.013749999925494194, + "rewards/_format_reward": 0.25, + "step": 100 + }, + { + "completion_length": 246.125, + "epoch": 0.02525, + "grad_norm": 0.480892539024353, + "kl": 0.0007099907961674035, + "learning_rate": 1.2625000000000002e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.0, + "step": 101 + }, + { + "completion_length": 233.375, + "epoch": 0.0255, + "grad_norm": 0.34077897667884827, + "kl": 0.0006432888912968338, + "learning_rate": 1.275e-06, + "loss": 0.0, + "reward": 0.53125, + "reward_std": 0.9106267690658569, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.25, + "step": 102 + }, + { + "completion_length": 122.875, + "epoch": 0.02575, + "grad_norm": 0.460584819316864, + "kl": 0.0007099162903614342, + "learning_rate": 1.2875000000000002e-06, + "loss": 0.0, + "reward": 1.2512500286102295, + "reward_std": 0.7314454913139343, + "rewards/_accuracy_reward": 0.5012500286102295, + "rewards/_format_reward": 0.75, + "step": 103 + }, + { + "completion_length": 202.875, + "epoch": 0.026, + "grad_norm": 0.4294702410697937, + "kl": 0.0006664457614533603, + "learning_rate": 1.3e-06, + "loss": 0.0, + "reward": 0.6937500238418579, + "reward_std": 0.7907670140266418, + "rewards/_accuracy_reward": 0.19374999403953552, + "rewards/_format_reward": 0.5, + "step": 104 + }, + { + "completion_length": 177.25, + "epoch": 0.02625, + "grad_norm": 0.40910154581069946, + "kl": 0.000768951780628413, + "learning_rate": 1.3125000000000001e-06, + "loss": 0.0, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.625, + "step": 105 + }, + { + "completion_length": 146.5, + "epoch": 0.0265, + "grad_norm": 0.45140013098716736, + "kl": 0.0007151229656301439, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 106 + }, + { + "completion_length": 245.375, + "epoch": 0.02675, + "grad_norm": 0.35899364948272705, + "kl": 0.000733047432731837, + "learning_rate": 1.3375000000000001e-06, + "loss": 0.0, + "reward": 0.75, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.375, + "rewards/_format_reward": 0.375, + "step": 107 + }, + { + "completion_length": 231.875, + "epoch": 0.027, + "grad_norm": 0.3983488082885742, + "kl": 0.0006859501008875668, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0, + "reward": 0.375, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.125, + "step": 108 + }, + { + "completion_length": 236.5, + "epoch": 0.02725, + "grad_norm": 0.3864477574825287, + "kl": 0.0007214280776679516, + "learning_rate": 1.3625000000000003e-06, + "loss": 0.0, + "reward": 0.29374998807907104, + "reward_std": 0.5314652919769287, + "rewards/_accuracy_reward": 0.04374999925494194, + "rewards/_format_reward": 0.25, + "step": 109 + }, + { + "completion_length": 105.75, + "epoch": 0.0275, + "grad_norm": 0.48207709193229675, + "kl": 0.0006837123655714095, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0, + "reward": 1.162500023841858, + "reward_std": 0.548211395740509, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 0.875, + "step": 110 + }, + { + "completion_length": 242.5, + "epoch": 0.02775, + "grad_norm": 0.3625413775444031, + "kl": 0.0006246070261113346, + "learning_rate": 1.3875000000000003e-06, + "loss": 0.0, + "reward": 0.26249998807907104, + "reward_std": 0.48605549335479736, + "rewards/_accuracy_reward": 0.012500000186264515, + "rewards/_format_reward": 0.25, + "step": 111 + }, + { + "completion_length": 207.875, + "epoch": 0.028, + "grad_norm": 0.40663740038871765, + "kl": 0.0007137281936593354, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0, + "reward": 0.90625, + "reward_std": 0.9994418025016785, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.5, + "step": 112 + }, + { + "completion_length": 236.875, + "epoch": 0.02825, + "grad_norm": 0.4253649413585663, + "kl": 0.0006713285110890865, + "learning_rate": 1.4125e-06, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 113 + }, + { + "completion_length": 178.125, + "epoch": 0.0285, + "grad_norm": 0.390341579914093, + "kl": 0.0006668589194305241, + "learning_rate": 1.425e-06, + "loss": 0.0, + "reward": 1.193750023841858, + "reward_std": 0.8304204940795898, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.75, + "step": 114 + }, + { + "completion_length": 218.875, + "epoch": 0.02875, + "grad_norm": 0.472661554813385, + "kl": 0.0006868990603834391, + "learning_rate": 1.4375e-06, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 115 + }, + { + "completion_length": 232.875, + "epoch": 0.029, + "grad_norm": 0.34683483839035034, + "kl": 0.000628342037089169, + "learning_rate": 1.45e-06, + "loss": 0.0, + "reward": 0.5325000286102295, + "reward_std": 0.7850523591041565, + "rewards/_accuracy_reward": 0.1574999988079071, + "rewards/_format_reward": 0.375, + "step": 116 + }, + { + "completion_length": 236.625, + "epoch": 0.02925, + "grad_norm": 0.3442273736000061, + "kl": 0.000823180191218853, + "learning_rate": 1.4625e-06, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 117 + }, + { + "completion_length": 238.375, + "epoch": 0.0295, + "grad_norm": 0.35203394293785095, + "kl": 0.0007420446490868926, + "learning_rate": 1.475e-06, + "loss": 0.0, + "reward": 0.6312500238418579, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.5062500238418579, + "rewards/_format_reward": 0.125, + "step": 118 + }, + { + "completion_length": 236.75, + "epoch": 0.02975, + "grad_norm": 0.411632239818573, + "kl": 0.0007851931732147932, + "learning_rate": 1.4875000000000002e-06, + "loss": 0.0, + "reward": 0.40625, + "reward_std": 0.7784771919250488, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.25, + "step": 119 + }, + { + "completion_length": 230.0, + "epoch": 0.03, + "grad_norm": 0.35817092657089233, + "kl": 0.0006097652949392796, + "learning_rate": 1.5e-06, + "loss": 0.0, + "reward": 0.8125, + "reward_std": 0.9136856198310852, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.5, + "step": 120 + }, + { + "completion_length": 226.125, + "epoch": 0.03025, + "grad_norm": 0.36551421880722046, + "kl": 0.0007124023977667093, + "learning_rate": 1.5125000000000001e-06, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.7647361755371094, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.25, + "step": 121 + }, + { + "completion_length": 168.875, + "epoch": 0.0305, + "grad_norm": 0.4331101179122925, + "kl": 0.0007970595033839345, + "learning_rate": 1.525e-06, + "loss": 0.0, + "reward": 1.3875000476837158, + "reward_std": 0.3879893124103546, + "rewards/_accuracy_reward": 0.38749998807907104, + "rewards/_format_reward": 1.0, + "step": 122 + }, + { + "completion_length": 241.25, + "epoch": 0.03075, + "grad_norm": 0.30984166264533997, + "kl": 0.0006017238483764231, + "learning_rate": 1.5375e-06, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 123 + }, + { + "completion_length": 215.625, + "epoch": 0.031, + "grad_norm": 0.4169147312641144, + "kl": 0.000758463516831398, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.0, + "reward": 0.2824999988079071, + "reward_std": 0.5242885947227478, + "rewards/_accuracy_reward": 0.1574999988079071, + "rewards/_format_reward": 0.125, + "step": 124 + }, + { + "completion_length": 225.0, + "epoch": 0.03125, + "grad_norm": 0.28472763299942017, + "kl": 0.0007155483472160995, + "learning_rate": 1.5625e-06, + "loss": 0.0, + "reward": 0.6312500238418579, + "reward_std": 0.9114108085632324, + "rewards/_accuracy_reward": 0.3812499940395355, + "rewards/_format_reward": 0.25, + "step": 125 + }, + { + "completion_length": 192.75, + "epoch": 0.0315, + "grad_norm": 0.4392932057380676, + "kl": 0.0008345048408955336, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0, + "reward": 0.6000000238418579, + "reward_std": 0.6447590589523315, + "rewards/_accuracy_reward": 0.10000000149011612, + "rewards/_format_reward": 0.5, + "step": 126 + }, + { + "completion_length": 193.0, + "epoch": 0.03175, + "grad_norm": 0.3476108908653259, + "kl": 0.000728312530554831, + "learning_rate": 1.5875e-06, + "loss": 0.0, + "reward": 0.9124999642372131, + "reward_std": 0.6384971141815186, + "rewards/_accuracy_reward": 0.16249999403953552, + "rewards/_format_reward": 0.75, + "step": 127 + }, + { + "completion_length": 221.875, + "epoch": 0.032, + "grad_norm": 0.42320966720581055, + "kl": 0.0008315700688399374, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.810092568397522, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.375, + "step": 128 + }, + { + "completion_length": 189.875, + "epoch": 0.03225, + "grad_norm": 0.3862830102443695, + "kl": 0.0008328621624968946, + "learning_rate": 1.6125000000000002e-06, + "loss": 0.0, + "reward": 0.875, + "reward_std": 0.7196229100227356, + "rewards/_accuracy_reward": 0.375, + "rewards/_format_reward": 0.5, + "step": 129 + }, + { + "completion_length": 237.875, + "epoch": 0.0325, + "grad_norm": 0.4434264004230499, + "kl": 0.000727821490727365, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0, + "reward": 0.40625, + "reward_std": 0.7784771919250488, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.25, + "step": 130 + }, + { + "completion_length": 173.125, + "epoch": 0.03275, + "grad_norm": 0.6541438102722168, + "kl": 0.0010414267890155315, + "learning_rate": 1.6375000000000002e-06, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.7647361755371094, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.25, + "step": 131 + }, + { + "completion_length": 140.0, + "epoch": 0.033, + "grad_norm": 0.4342222511768341, + "kl": 0.0008442209218628705, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0, + "reward": 1.15625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.75, + "step": 132 + }, + { + "completion_length": 155.375, + "epoch": 0.03325, + "grad_norm": 0.4700578451156616, + "kl": 0.0009654579916968942, + "learning_rate": 1.6625000000000002e-06, + "loss": 0.0, + "reward": 1.1875, + "reward_std": 0.6373774409294128, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.5, + "step": 133 + }, + { + "completion_length": 235.125, + "epoch": 0.0335, + "grad_norm": 0.4328174889087677, + "kl": 0.0007693137740716338, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0, + "reward": 0.3812499940395355, + "reward_std": 0.75020831823349, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.25, + "step": 134 + }, + { + "completion_length": 256.0, + "epoch": 0.03375, + "grad_norm": 0.0011643688194453716, + "kl": 0.0006938659935258329, + "learning_rate": 1.6875000000000001e-06, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.0, + "rewards/_format_reward": 0.0, + "step": 135 + }, + { + "completion_length": 176.25, + "epoch": 0.034, + "grad_norm": 0.46144965291023254, + "kl": 0.0012229140847921371, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0, + "reward": 0.42499998211860657, + "reward_std": 0.5763183832168579, + "rewards/_accuracy_reward": 0.05000000447034836, + "rewards/_format_reward": 0.375, + "step": 136 + }, + { + "completion_length": 108.25, + "epoch": 0.03425, + "grad_norm": 0.641877293586731, + "kl": 0.000889874529093504, + "learning_rate": 1.7125000000000003e-06, + "loss": 0.0, + "reward": 1.1375000476837158, + "reward_std": 0.8327021598815918, + "rewards/_accuracy_reward": 0.512499988079071, + "rewards/_format_reward": 0.625, + "step": 137 + }, + { + "completion_length": 243.875, + "epoch": 0.0345, + "grad_norm": 0.4593142569065094, + "kl": 0.000851454387884587, + "learning_rate": 1.725e-06, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 138 + }, + { + "completion_length": 166.75, + "epoch": 0.03475, + "grad_norm": 0.37833383679389954, + "kl": 0.000998564064502716, + "learning_rate": 1.7375e-06, + "loss": 0.0, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.625, + "step": 139 + }, + { + "completion_length": 155.5, + "epoch": 0.035, + "grad_norm": 0.48516976833343506, + "kl": 0.0008888572920113802, + "learning_rate": 1.75e-06, + "loss": 0.0, + "reward": 0.9812500476837158, + "reward_std": 0.6750330924987793, + "rewards/_accuracy_reward": 0.23124998807907104, + "rewards/_format_reward": 0.75, + "step": 140 + }, + { + "completion_length": 140.0, + "epoch": 0.03525, + "grad_norm": 0.4754278063774109, + "kl": 0.001002427306957543, + "learning_rate": 1.7625e-06, + "loss": 0.0, + "reward": 1.34375, + "reward_std": 0.8230767846107483, + "rewards/_accuracy_reward": 0.59375, + "rewards/_format_reward": 0.75, + "step": 141 + }, + { + "completion_length": 221.875, + "epoch": 0.0355, + "grad_norm": 0.5134493708610535, + "kl": 0.0009355404181405902, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 142 + }, + { + "completion_length": 204.0, + "epoch": 0.03575, + "grad_norm": 0.41457489132881165, + "kl": 0.001152246375568211, + "learning_rate": 1.7875e-06, + "loss": 0.0, + "reward": 0.2562499940395355, + "reward_std": 0.7047985792160034, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.125, + "step": 143 + }, + { + "completion_length": 241.75, + "epoch": 0.036, + "grad_norm": 0.37025943398475647, + "kl": 0.0008789349813014269, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 144 + }, + { + "completion_length": 145.25, + "epoch": 0.03625, + "grad_norm": 0.6227016448974609, + "kl": 0.0012156821321696043, + "learning_rate": 1.8125e-06, + "loss": 0.0, + "reward": 0.9387500286102295, + "reward_std": 0.8531527519226074, + "rewards/_accuracy_reward": 0.3137499988079071, + "rewards/_format_reward": 0.625, + "step": 145 + }, + { + "completion_length": 194.25, + "epoch": 0.0365, + "grad_norm": 0.40095922350883484, + "kl": 0.0010887724347412586, + "learning_rate": 1.825e-06, + "loss": 0.0, + "reward": 1.193750023841858, + "reward_std": 0.8304204940795898, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.75, + "step": 146 + }, + { + "completion_length": 210.5, + "epoch": 0.03675, + "grad_norm": 0.3590393662452698, + "kl": 0.001111085875891149, + "learning_rate": 1.8375000000000002e-06, + "loss": 0.0, + "reward": 0.6312500238418579, + "reward_std": 0.6169843077659607, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.5, + "step": 147 + }, + { + "completion_length": 207.25, + "epoch": 0.037, + "grad_norm": 0.46096640825271606, + "kl": 0.0010449312394484878, + "learning_rate": 1.85e-06, + "loss": 0.0, + "reward": 0.8887499570846558, + "reward_std": 0.8371795415878296, + "rewards/_accuracy_reward": 0.26374998688697815, + "rewards/_format_reward": 0.625, + "step": 148 + }, + { + "completion_length": 245.125, + "epoch": 0.03725, + "grad_norm": 0.3660503327846527, + "kl": 0.0008743983926251531, + "learning_rate": 1.8625000000000002e-06, + "loss": 0.0, + "reward": 0.3812499940395355, + "reward_std": 0.7502082586288452, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.25, + "step": 149 + }, + { + "completion_length": 204.75, + "epoch": 0.0375, + "grad_norm": 0.3448229730129242, + "kl": 0.0009303970145992935, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0, + "reward": 1.1324999332427979, + "reward_std": 0.549304723739624, + "rewards/_accuracy_reward": 0.2574999928474426, + "rewards/_format_reward": 0.875, + "step": 150 + }, + { + "completion_length": 201.125, + "epoch": 0.03775, + "grad_norm": 0.4772341847419739, + "kl": 0.0015494409017264843, + "learning_rate": 1.8875000000000001e-06, + "loss": 0.0001, + "reward": 0.3812499940395355, + "reward_std": 0.75020831823349, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.25, + "step": 151 + }, + { + "completion_length": 168.5, + "epoch": 0.038, + "grad_norm": 0.5399798154830933, + "kl": 0.001348810619674623, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0001, + "reward": 1.1437499523162842, + "reward_std": 0.8317097425460815, + "rewards/_accuracy_reward": 0.39374998211860657, + "rewards/_format_reward": 0.75, + "step": 152 + }, + { + "completion_length": 223.625, + "epoch": 0.03825, + "grad_norm": 0.36858677864074707, + "kl": 0.0011914662318304181, + "learning_rate": 1.9125000000000003e-06, + "loss": 0.0, + "reward": 0.8812500238418579, + "reward_std": 0.9920892715454102, + "rewards/_accuracy_reward": 0.3812499940395355, + "rewards/_format_reward": 0.5, + "step": 153 + }, + { + "completion_length": 179.0, + "epoch": 0.0385, + "grad_norm": 0.4605531394481659, + "kl": 0.0013787942007184029, + "learning_rate": 1.925e-06, + "loss": 0.0001, + "reward": 0.90625, + "reward_std": 0.9994418025016785, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.5, + "step": 154 + }, + { + "completion_length": 252.25, + "epoch": 0.03875, + "grad_norm": 0.4210733473300934, + "kl": 0.0009038643911480904, + "learning_rate": 1.9375e-06, + "loss": 0.0, + "reward": 0.15625, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.03125, + "rewards/_format_reward": 0.125, + "step": 155 + }, + { + "completion_length": 246.0, + "epoch": 0.039, + "grad_norm": 0.40100812911987305, + "kl": 0.001074227737262845, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0, + "reward": 0.3762499988079071, + "reward_std": 0.7452312707901001, + "rewards/_accuracy_reward": 0.1262499988079071, + "rewards/_format_reward": 0.25, + "step": 156 + }, + { + "completion_length": 215.875, + "epoch": 0.03925, + "grad_norm": 0.39963439106941223, + "kl": 0.0015569372335448861, + "learning_rate": 1.9625000000000003e-06, + "loss": 0.0001, + "reward": 0.34375, + "reward_std": 0.5659615993499756, + "rewards/_accuracy_reward": 0.09375, + "rewards/_format_reward": 0.25, + "step": 157 + }, + { + "completion_length": 251.625, + "epoch": 0.0395, + "grad_norm": 0.34965765476226807, + "kl": 0.0011444491101428866, + "learning_rate": 1.975e-06, + "loss": 0.0, + "reward": 0.13124999403953552, + "reward_std": 0.3712310194969177, + "rewards/_accuracy_reward": 0.0062500000931322575, + "rewards/_format_reward": 0.125, + "step": 158 + }, + { + "completion_length": 243.75, + "epoch": 0.03975, + "grad_norm": 0.36649003624916077, + "kl": 0.0010651213815435767, + "learning_rate": 1.9875000000000005e-06, + "loss": 0.0, + "reward": 0.5625, + "reward_std": 0.810092568397522, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.375, + "step": 159 + }, + { + "completion_length": 199.125, + "epoch": 0.04, + "grad_norm": 0.4516124725341797, + "kl": 0.0014434423064813018, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0001, + "reward": 0.824999988079071, + "reward_std": 0.7449832558631897, + "rewards/_accuracy_reward": 0.19999998807907104, + "rewards/_format_reward": 0.625, + "step": 160 + }, + { + "completion_length": 207.875, + "epoch": 0.04025, + "grad_norm": 0.35151946544647217, + "kl": 0.001188786351121962, + "learning_rate": 2.0125000000000002e-06, + "loss": 0.0, + "reward": 0.5449999570846558, + "reward_std": 0.5869290232658386, + "rewards/_accuracy_reward": 0.044999998062849045, + "rewards/_format_reward": 0.5, + "step": 161 + }, + { + "completion_length": 192.625, + "epoch": 0.0405, + "grad_norm": 0.5956340432167053, + "kl": 0.001869518426246941, + "learning_rate": 2.025e-06, + "loss": 0.0001, + "reward": 0.5187499523162842, + "reward_std": 0.7610788941383362, + "rewards/_accuracy_reward": 0.14374998211860657, + "rewards/_format_reward": 0.375, + "step": 162 + }, + { + "completion_length": 241.125, + "epoch": 0.04075, + "grad_norm": 0.35935690999031067, + "kl": 0.0010445680236443877, + "learning_rate": 2.0375e-06, + "loss": 0.0, + "reward": 0.40625, + "reward_std": 0.7784771919250488, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.25, + "step": 163 + }, + { + "completion_length": 166.25, + "epoch": 0.041, + "grad_norm": 0.36909234523773193, + "kl": 0.0019193933112546802, + "learning_rate": 2.05e-06, + "loss": 0.0001, + "reward": 1.181249976158142, + "reward_std": 0.6335486769676208, + "rewards/_accuracy_reward": 0.3062499761581421, + "rewards/_format_reward": 0.875, + "step": 164 + }, + { + "completion_length": 228.75, + "epoch": 0.04125, + "grad_norm": 0.402389794588089, + "kl": 0.0016254698857665062, + "learning_rate": 2.0625e-06, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 165 + }, + { + "completion_length": 113.625, + "epoch": 0.0415, + "grad_norm": 0.43517887592315674, + "kl": 0.0022651588078588247, + "learning_rate": 2.075e-06, + "loss": 0.0001, + "reward": 1.2937500476837158, + "reward_std": 0.29932963848114014, + "rewards/_accuracy_reward": 0.29374998807907104, + "rewards/_format_reward": 1.0, + "step": 166 + }, + { + "completion_length": 184.25, + "epoch": 0.04175, + "grad_norm": 0.4854840040206909, + "kl": 0.00202546757645905, + "learning_rate": 2.0875e-06, + "loss": 0.0001, + "reward": 0.9375, + "reward_std": 0.6028207540512085, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.75, + "step": 167 + }, + { + "completion_length": 173.0, + "epoch": 0.042, + "grad_norm": 0.36054983735084534, + "kl": 0.0024099252186715603, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0001, + "reward": 1.28125, + "reward_std": 0.6187184453010559, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.875, + "step": 168 + }, + { + "completion_length": 214.625, + "epoch": 0.04225, + "grad_norm": 0.4301230311393738, + "kl": 0.0015324982814490795, + "learning_rate": 2.1125e-06, + "loss": 0.0001, + "reward": 0.90625, + "reward_std": 0.9994418025016785, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.5, + "step": 169 + }, + { + "completion_length": 163.25, + "epoch": 0.0425, + "grad_norm": 0.4773411154747009, + "kl": 0.0024503618478775024, + "learning_rate": 2.125e-06, + "loss": 0.0001, + "reward": 1.0499999523162842, + "reward_std": 0.7606388330459595, + "rewards/_accuracy_reward": 0.29999998211860657, + "rewards/_format_reward": 0.75, + "step": 170 + }, + { + "completion_length": 201.0, + "epoch": 0.04275, + "grad_norm": 0.3986191153526306, + "kl": 0.002186427591368556, + "learning_rate": 2.1375000000000003e-06, + "loss": 0.0001, + "reward": 0.6312500238418579, + "reward_std": 0.9192145466804504, + "rewards/_accuracy_reward": 0.2562499940395355, + "rewards/_format_reward": 0.375, + "step": 171 + }, + { + "completion_length": 248.875, + "epoch": 0.043, + "grad_norm": 0.38168638944625854, + "kl": 0.002373702824115753, + "learning_rate": 2.15e-06, + "loss": 0.0001, + "reward": 0.3824999928474426, + "reward_std": 0.7494902610778809, + "rewards/_accuracy_reward": 0.13249999284744263, + "rewards/_format_reward": 0.25, + "step": 172 + }, + { + "completion_length": 182.875, + "epoch": 0.04325, + "grad_norm": 0.5253975987434387, + "kl": 0.0032906224951148033, + "learning_rate": 2.1625e-06, + "loss": 0.0001, + "reward": 1.037500023841858, + "reward_std": 0.9299577474594116, + "rewards/_accuracy_reward": 0.4124999940395355, + "rewards/_format_reward": 0.625, + "step": 173 + }, + { + "completion_length": 143.75, + "epoch": 0.0435, + "grad_norm": 0.57611483335495, + "kl": 0.0019308909540995955, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0001, + "reward": 1.756250023841858, + "reward_std": 0.6894291639328003, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 174 + }, + { + "completion_length": 254.5, + "epoch": 0.04375, + "grad_norm": 0.34048524498939514, + "kl": 0.0014960013795644045, + "learning_rate": 2.1875000000000002e-06, + "loss": 0.0001, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/_accuracy_reward": 0.0062500000931322575, + "rewards/_format_reward": 0.0, + "step": 175 + }, + { + "completion_length": 230.875, + "epoch": 0.044, + "grad_norm": 0.39282166957855225, + "kl": 0.0017745784716680646, + "learning_rate": 2.2e-06, + "loss": 0.0001, + "reward": 0.8812500238418579, + "reward_std": 0.9920892715454102, + "rewards/_accuracy_reward": 0.3812499940395355, + "rewards/_format_reward": 0.5, + "step": 176 + }, + { + "completion_length": 95.125, + "epoch": 0.04425, + "grad_norm": 0.64809650182724, + "kl": 0.0027632713317871094, + "learning_rate": 2.2125e-06, + "loss": 0.0001, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.75, + "step": 177 + }, + { + "completion_length": 235.375, + "epoch": 0.0445, + "grad_norm": 0.4109288454055786, + "kl": 0.0018161768093705177, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0001, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 178 + }, + { + "completion_length": 239.5, + "epoch": 0.04475, + "grad_norm": 0.3848976492881775, + "kl": 0.0016923128860071301, + "learning_rate": 2.2375e-06, + "loss": 0.0001, + "reward": 0.5249999761581421, + "reward_std": 0.5612486004829407, + "rewards/_accuracy_reward": 0.02500000037252903, + "rewards/_format_reward": 0.5, + "step": 179 + }, + { + "completion_length": 234.875, + "epoch": 0.045, + "grad_norm": 0.4291512072086334, + "kl": 0.001589511870406568, + "learning_rate": 2.25e-06, + "loss": 0.0001, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 180 + }, + { + "completion_length": 233.625, + "epoch": 0.04525, + "grad_norm": 0.5065072774887085, + "kl": 0.0018506099004298449, + "learning_rate": 2.2625000000000004e-06, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 181 + }, + { + "completion_length": 214.5, + "epoch": 0.0455, + "grad_norm": 0.4389038383960724, + "kl": 0.0023762795608490705, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0001, + "reward": 0.39374998211860657, + "reward_std": 0.5434265732765198, + "rewards/_accuracy_reward": 0.01875000074505806, + "rewards/_format_reward": 0.375, + "step": 182 + }, + { + "completion_length": 210.0, + "epoch": 0.04575, + "grad_norm": 0.45881783962249756, + "kl": 0.001946362666785717, + "learning_rate": 2.2875e-06, + "loss": 0.0001, + "reward": 0.5687500238418579, + "reward_std": 0.7690148949623108, + "rewards/_accuracy_reward": 0.19374999403953552, + "rewards/_format_reward": 0.375, + "step": 183 + }, + { + "completion_length": 219.5, + "epoch": 0.046, + "grad_norm": 0.4343101978302002, + "kl": 0.0028733538929373026, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0001, + "reward": 0.625, + "reward_std": 0.9161254167556763, + "rewards/_accuracy_reward": 0.375, + "rewards/_format_reward": 0.25, + "step": 184 + }, + { + "completion_length": 220.125, + "epoch": 0.04625, + "grad_norm": 0.4084475338459015, + "kl": 0.00223284843377769, + "learning_rate": 2.3125000000000003e-06, + "loss": 0.0001, + "reward": 0.7200000286102295, + "reward_std": 0.805374801158905, + "rewards/_accuracy_reward": 0.2199999988079071, + "rewards/_format_reward": 0.5, + "step": 185 + }, + { + "completion_length": 171.625, + "epoch": 0.0465, + "grad_norm": 0.5016433000564575, + "kl": 0.0028451047837734222, + "learning_rate": 2.325e-06, + "loss": 0.0001, + "reward": 1.037500023841858, + "reward_std": 0.9299578070640564, + "rewards/_accuracy_reward": 0.4124999940395355, + "rewards/_format_reward": 0.625, + "step": 186 + }, + { + "completion_length": 168.125, + "epoch": 0.04675, + "grad_norm": 0.3974522650241852, + "kl": 0.0034177624620497227, + "learning_rate": 2.3375000000000005e-06, + "loss": 0.0001, + "reward": 1.2937500476837158, + "reward_std": 0.6945900917053223, + "rewards/_accuracy_reward": 0.543749988079071, + "rewards/_format_reward": 0.75, + "step": 187 + }, + { + "completion_length": 249.5, + "epoch": 0.047, + "grad_norm": 0.4274694621562958, + "kl": 0.001807468244805932, + "learning_rate": 2.35e-06, + "loss": 0.0001, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 188 + }, + { + "completion_length": 244.5, + "epoch": 0.04725, + "grad_norm": 0.006730486173182726, + "kl": 0.0024024751037359238, + "learning_rate": 2.3625000000000003e-06, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.0, + "rewards/_format_reward": 0.0, + "step": 189 + }, + { + "completion_length": 235.875, + "epoch": 0.0475, + "grad_norm": 0.4219696521759033, + "kl": 0.002536715939640999, + "learning_rate": 2.375e-06, + "loss": 0.0001, + "reward": 0.6000000238418579, + "reward_std": 0.6447591185569763, + "rewards/_accuracy_reward": 0.10000000149011612, + "rewards/_format_reward": 0.5, + "step": 190 + }, + { + "completion_length": 202.0, + "epoch": 0.04775, + "grad_norm": 0.4399944245815277, + "kl": 0.0024324413388967514, + "learning_rate": 2.3875e-06, + "loss": 0.0001, + "reward": 1.0, + "reward_std": 0.8237544298171997, + "rewards/_accuracy_reward": 0.375, + "rewards/_format_reward": 0.625, + "step": 191 + }, + { + "completion_length": 244.0, + "epoch": 0.048, + "grad_norm": 0.3966406285762787, + "kl": 0.002154730260372162, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0001, + "reward": 0.40625, + "reward_std": 0.7784771919250488, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.25, + "step": 192 + }, + { + "completion_length": 214.625, + "epoch": 0.04825, + "grad_norm": 0.43435680866241455, + "kl": 0.002745242090895772, + "learning_rate": 2.4125e-06, + "loss": 0.0001, + "reward": 1.1875, + "reward_std": 0.831843912601471, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.625, + "step": 193 + }, + { + "completion_length": 171.625, + "epoch": 0.0485, + "grad_norm": 0.38358110189437866, + "kl": 0.0024988814257085323, + "learning_rate": 2.425e-06, + "loss": 0.0001, + "reward": 1.2312500476837158, + "reward_std": 0.6284547448158264, + "rewards/_accuracy_reward": 0.35624998807907104, + "rewards/_format_reward": 0.875, + "step": 194 + }, + { + "completion_length": 189.875, + "epoch": 0.04875, + "grad_norm": 0.44296523928642273, + "kl": 0.0033659208565950394, + "learning_rate": 2.4375e-06, + "loss": 0.0001, + "reward": 1.068750023841858, + "reward_std": 0.9250240921974182, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.625, + "step": 195 + }, + { + "completion_length": 207.125, + "epoch": 0.049, + "grad_norm": 0.4014991819858551, + "kl": 0.004617661237716675, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0002, + "reward": 0.6437499523162842, + "reward_std": 0.7456720471382141, + "rewards/_accuracy_reward": 0.26874998211860657, + "rewards/_format_reward": 0.375, + "step": 196 + }, + { + "completion_length": 229.125, + "epoch": 0.04925, + "grad_norm": 0.4298330247402191, + "kl": 0.0023158607073128223, + "learning_rate": 2.4625e-06, + "loss": 0.0001, + "reward": 0.5375000238418579, + "reward_std": 0.7886471748352051, + "rewards/_accuracy_reward": 0.16249999403953552, + "rewards/_format_reward": 0.375, + "step": 197 + }, + { + "completion_length": 152.125, + "epoch": 0.0495, + "grad_norm": 0.6460537314414978, + "kl": 0.004403269849717617, + "learning_rate": 2.475e-06, + "loss": 0.0002, + "reward": 1.693750023841858, + "reward_std": 0.42714792490005493, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 198 + }, + { + "completion_length": 224.625, + "epoch": 0.04975, + "grad_norm": 0.3826991617679596, + "kl": 0.00469087902456522, + "learning_rate": 2.4875000000000003e-06, + "loss": 0.0002, + "reward": 0.8199999928474426, + "reward_std": 0.7337185144424438, + "rewards/_accuracy_reward": 0.3199999928474426, + "rewards/_format_reward": 0.5, + "step": 199 + }, + { + "completion_length": 233.5, + "epoch": 0.05, + "grad_norm": 0.4221881031990051, + "kl": 0.003704667557030916, + "learning_rate": 2.5e-06, + "loss": 0.0001, + "reward": 0.4437499940395355, + "reward_std": 0.6155354380607605, + "rewards/_accuracy_reward": 0.06875000149011612, + "rewards/_format_reward": 0.375, + "step": 200 + }, + { + "completion_length": 231.75, + "epoch": 0.05025, + "grad_norm": 0.43503835797309875, + "kl": 0.003693893551826477, + "learning_rate": 2.5125e-06, + "loss": 0.0001, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 201 + }, + { + "completion_length": 138.0, + "epoch": 0.0505, + "grad_norm": 0.4424368739128113, + "kl": 0.005755322519689798, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0002, + "reward": 1.0625, + "reward_std": 0.6373774409294128, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.75, + "step": 202 + }, + { + "completion_length": 242.875, + "epoch": 0.05075, + "grad_norm": 0.34338605403900146, + "kl": 0.003572634654119611, + "learning_rate": 2.5375e-06, + "loss": 0.0001, + "reward": 0.13749998807907104, + "reward_std": 0.3691205382347107, + "rewards/_accuracy_reward": 0.012500000186264515, + "rewards/_format_reward": 0.125, + "step": 203 + }, + { + "completion_length": 147.25, + "epoch": 0.051, + "grad_norm": 0.5478609204292297, + "kl": 0.005715084727853537, + "learning_rate": 2.55e-06, + "loss": 0.0002, + "reward": 1.162500023841858, + "reward_std": 0.548211395740509, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 0.875, + "step": 204 + }, + { + "completion_length": 208.875, + "epoch": 0.05125, + "grad_norm": 0.4353210926055908, + "kl": 0.006955728400498629, + "learning_rate": 2.5625e-06, + "loss": 0.0003, + "reward": 0.84375, + "reward_std": 0.8857755064964294, + "rewards/_accuracy_reward": 0.34375, + "rewards/_format_reward": 0.5, + "step": 205 + }, + { + "completion_length": 183.125, + "epoch": 0.0515, + "grad_norm": 0.48688188195228577, + "kl": 0.005555164068937302, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0002, + "reward": 1.065000057220459, + "reward_std": 0.7645166516304016, + "rewards/_accuracy_reward": 0.3149999976158142, + "rewards/_format_reward": 0.75, + "step": 206 + }, + { + "completion_length": 161.375, + "epoch": 0.05175, + "grad_norm": 0.5545176267623901, + "kl": 0.005829046946018934, + "learning_rate": 2.5875000000000002e-06, + "loss": 0.0002, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 207 + }, + { + "completion_length": 225.0, + "epoch": 0.052, + "grad_norm": 0.5879900455474854, + "kl": 0.0061422791332006454, + "learning_rate": 2.6e-06, + "loss": 0.0002, + "reward": 0.2874999940395355, + "reward_std": 0.5350233316421509, + "rewards/_accuracy_reward": 0.03750000149011612, + "rewards/_format_reward": 0.25, + "step": 208 + }, + { + "completion_length": 157.5, + "epoch": 0.05225, + "grad_norm": 0.42013055086135864, + "kl": 0.007088819984346628, + "learning_rate": 2.6125e-06, + "loss": 0.0003, + "reward": 1.537500023841858, + "reward_std": 0.7322909235954285, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.875, + "step": 209 + }, + { + "completion_length": 191.125, + "epoch": 0.0525, + "grad_norm": 0.3625972867012024, + "kl": 0.0053137680515646935, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0002, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 210 + }, + { + "completion_length": 223.0, + "epoch": 0.05275, + "grad_norm": 0.48341336846351624, + "kl": 0.005899759475141764, + "learning_rate": 2.6375e-06, + "loss": 0.0002, + "reward": 0.3137499988079071, + "reward_std": 0.5151404142379761, + "rewards/_accuracy_reward": 0.0637499988079071, + "rewards/_format_reward": 0.25, + "step": 211 + }, + { + "completion_length": 219.5, + "epoch": 0.053, + "grad_norm": 0.46322962641716003, + "kl": 0.005710378754884005, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0002, + "reward": 0.9387500286102295, + "reward_std": 0.8531528115272522, + "rewards/_accuracy_reward": 0.3137499988079071, + "rewards/_format_reward": 0.625, + "step": 212 + }, + { + "completion_length": 198.125, + "epoch": 0.05325, + "grad_norm": 0.5013320446014404, + "kl": 0.008783817291259766, + "learning_rate": 2.6625e-06, + "loss": 0.0004, + "reward": 0.7825000286102295, + "reward_std": 0.9011222720146179, + "rewards/_accuracy_reward": 0.2824999988079071, + "rewards/_format_reward": 0.5, + "step": 213 + }, + { + "completion_length": 242.625, + "epoch": 0.0535, + "grad_norm": 0.3888327181339264, + "kl": 0.0044942148961126804, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0002, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.25, + "step": 214 + }, + { + "completion_length": 236.75, + "epoch": 0.05375, + "grad_norm": 0.4056166410446167, + "kl": 0.004561163019388914, + "learning_rate": 2.6875e-06, + "loss": 0.0002, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 215 + }, + { + "completion_length": 243.375, + "epoch": 0.054, + "grad_norm": 0.42273250222206116, + "kl": 0.005308468360453844, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0002, + "reward": 0.6312500238418579, + "reward_std": 0.9192146062850952, + "rewards/_accuracy_reward": 0.2562499940395355, + "rewards/_format_reward": 0.375, + "step": 216 + }, + { + "completion_length": 241.125, + "epoch": 0.05425, + "grad_norm": 0.4731215536594391, + "kl": 0.005138865672051907, + "learning_rate": 2.7125000000000003e-06, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.375, + "rewards/_format_reward": 0.375, + "step": 217 + }, + { + "completion_length": 120.625, + "epoch": 0.0545, + "grad_norm": 0.6542100310325623, + "kl": 0.0068123298697173595, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0003, + "reward": 1.1387500762939453, + "reward_std": 0.12017104029655457, + "rewards/_accuracy_reward": 0.26374998688697815, + "rewards/_format_reward": 0.875, + "step": 218 + }, + { + "completion_length": 239.625, + "epoch": 0.05475, + "grad_norm": 0.5177122950553894, + "kl": 0.004553478676825762, + "learning_rate": 2.7375e-06, + "loss": 0.0002, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 0.125, + "step": 219 + }, + { + "completion_length": 161.75, + "epoch": 0.055, + "grad_norm": 0.4810257852077484, + "kl": 0.008091006428003311, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0003, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 220 + }, + { + "completion_length": 207.0, + "epoch": 0.05525, + "grad_norm": 0.40970757603645325, + "kl": 0.007687645964324474, + "learning_rate": 2.7625000000000002e-06, + "loss": 0.0003, + "reward": 1.024999976158142, + "reward_std": 0.7564013004302979, + "rewards/_accuracy_reward": 0.2749999761581421, + "rewards/_format_reward": 0.75, + "step": 221 + }, + { + "completion_length": 154.75, + "epoch": 0.0555, + "grad_norm": 0.5231997966766357, + "kl": 0.009649819694459438, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0004, + "reward": 1.3125, + "reward_std": 0.873723566532135, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.75, + "step": 222 + }, + { + "completion_length": 216.375, + "epoch": 0.05575, + "grad_norm": 0.443153440952301, + "kl": 0.007553863804787397, + "learning_rate": 2.7875000000000004e-06, + "loss": 0.0003, + "reward": 0.71875, + "reward_std": 0.8066409826278687, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.5, + "step": 223 + }, + { + "completion_length": 163.625, + "epoch": 0.056, + "grad_norm": 0.539243221282959, + "kl": 0.009118853136897087, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 224 + }, + { + "completion_length": 157.875, + "epoch": 0.05625, + "grad_norm": 0.5195901989936829, + "kl": 0.009252168238162994, + "learning_rate": 2.8125e-06, + "loss": 0.0004, + "reward": 1.287500023841858, + "reward_std": 0.8786149024963379, + "rewards/_accuracy_reward": 0.5375000238418579, + "rewards/_format_reward": 0.75, + "step": 225 + }, + { + "completion_length": 188.75, + "epoch": 0.0565, + "grad_norm": 0.4762548804283142, + "kl": 0.008448407053947449, + "learning_rate": 2.825e-06, + "loss": 0.0003, + "reward": 0.875, + "reward_std": 0.7676494717597961, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 0.625, + "step": 226 + }, + { + "completion_length": 184.375, + "epoch": 0.05675, + "grad_norm": 0.46647635102272034, + "kl": 0.01854267716407776, + "learning_rate": 2.8375000000000004e-06, + "loss": 0.0007, + "reward": 0.7875000238418579, + "reward_std": 0.7024192810058594, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 0.5, + "step": 227 + }, + { + "completion_length": 146.125, + "epoch": 0.057, + "grad_norm": 0.6877365112304688, + "kl": 0.00886484608054161, + "learning_rate": 2.85e-06, + "loss": 0.0004, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 228 + }, + { + "completion_length": 229.0, + "epoch": 0.05725, + "grad_norm": 0.5037513375282288, + "kl": 0.00627094367519021, + "learning_rate": 2.8625e-06, + "loss": 0.0003, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 229 + }, + { + "completion_length": 129.25, + "epoch": 0.0575, + "grad_norm": 1.007373571395874, + "kl": 0.01517564244568348, + "learning_rate": 2.875e-06, + "loss": 0.0006, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 230 + }, + { + "completion_length": 243.625, + "epoch": 0.05775, + "grad_norm": 0.41951367259025574, + "kl": 0.007699695415794849, + "learning_rate": 2.8875000000000003e-06, + "loss": 0.0003, + "reward": 0.3812499940395355, + "reward_std": 0.7502082586288452, + "rewards/_accuracy_reward": 0.13124999403953552, + "rewards/_format_reward": 0.25, + "step": 231 + }, + { + "completion_length": 198.125, + "epoch": 0.058, + "grad_norm": 0.3918209969997406, + "kl": 0.010249249637126923, + "learning_rate": 2.9e-06, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 232 + }, + { + "completion_length": 213.875, + "epoch": 0.05825, + "grad_norm": 0.41451263427734375, + "kl": 0.008266115561127663, + "learning_rate": 2.9125000000000005e-06, + "loss": 0.0003, + "reward": 1.0125000476837158, + "reward_std": 0.9261093735694885, + "rewards/_accuracy_reward": 0.38749998807907104, + "rewards/_format_reward": 0.625, + "step": 233 + }, + { + "completion_length": 160.125, + "epoch": 0.0585, + "grad_norm": 0.4602366089820862, + "kl": 0.011817101389169693, + "learning_rate": 2.925e-06, + "loss": 0.0005, + "reward": 1.1375000476837158, + "reward_std": 0.548211395740509, + "rewards/_accuracy_reward": 0.26249998807907104, + "rewards/_format_reward": 0.875, + "step": 234 + }, + { + "completion_length": 150.125, + "epoch": 0.05875, + "grad_norm": 0.5556851029396057, + "kl": 0.012773082591593266, + "learning_rate": 2.9375000000000003e-06, + "loss": 0.0005, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 235 + }, + { + "completion_length": 236.625, + "epoch": 0.059, + "grad_norm": 0.3992154002189636, + "kl": 0.008578523993492126, + "learning_rate": 2.95e-06, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 1.0690449476242065, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.5, + "step": 236 + }, + { + "completion_length": 223.25, + "epoch": 0.05925, + "grad_norm": 0.4743492901325226, + "kl": 0.009763360023498535, + "learning_rate": 2.9625000000000004e-06, + "loss": 0.0004, + "reward": 0.512499988079071, + "reward_std": 0.765669584274292, + "rewards/_accuracy_reward": 0.13749998807907104, + "rewards/_format_reward": 0.375, + "step": 237 + }, + { + "completion_length": 232.75, + "epoch": 0.0595, + "grad_norm": 0.3819025754928589, + "kl": 0.009855308569967747, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.0004, + "reward": 0.6687500476837158, + "reward_std": 0.7736451625823975, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 0.5, + "step": 238 + }, + { + "completion_length": 219.0, + "epoch": 0.05975, + "grad_norm": 0.6082333922386169, + "kl": 0.013843866065144539, + "learning_rate": 2.9875e-06, + "loss": 0.0006, + "reward": 0.65625, + "reward_std": 0.9348175525665283, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.375, + "step": 239 + }, + { + "completion_length": 243.75, + "epoch": 0.06, + "grad_norm": 0.40308472514152527, + "kl": 0.009744809940457344, + "learning_rate": 3e-06, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.375, + "rewards/_format_reward": 0.375, + "step": 240 + }, + { + "completion_length": 169.375, + "epoch": 0.06025, + "grad_norm": 0.6102287769317627, + "kl": 0.012486739084124565, + "learning_rate": 3.0125000000000004e-06, + "loss": 0.0005, + "reward": 0.971250057220459, + "reward_std": 0.6741225123405457, + "rewards/_accuracy_reward": 0.2212499976158142, + "rewards/_format_reward": 0.75, + "step": 241 + }, + { + "completion_length": 192.125, + "epoch": 0.0605, + "grad_norm": 0.4726582169532776, + "kl": 0.014720995910465717, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0006, + "reward": 0.90625, + "reward_std": 0.6483151316642761, + "rewards/_accuracy_reward": 0.1562499850988388, + "rewards/_format_reward": 0.75, + "step": 242 + }, + { + "completion_length": 207.625, + "epoch": 0.06075, + "grad_norm": 0.4914742708206177, + "kl": 0.019970744848251343, + "learning_rate": 3.0375000000000006e-06, + "loss": 0.0008, + "reward": 1.15625, + "reward_std": 0.9904679656028748, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 0.625, + "step": 243 + }, + { + "completion_length": 153.625, + "epoch": 0.061, + "grad_norm": 0.4838781952857971, + "kl": 0.015723643824458122, + "learning_rate": 3.05e-06, + "loss": 0.0006, + "reward": 1.375, + "reward_std": 0.6681531071662903, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.875, + "step": 244 + }, + { + "completion_length": 202.375, + "epoch": 0.06125, + "grad_norm": 0.47575071454048157, + "kl": 0.01339123584330082, + "learning_rate": 3.0625000000000003e-06, + "loss": 0.0005, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 245 + }, + { + "completion_length": 190.375, + "epoch": 0.0615, + "grad_norm": 0.4257509410381317, + "kl": 0.01585126481950283, + "learning_rate": 3.075e-06, + "loss": 0.0006, + "reward": 1.193750023841858, + "reward_std": 0.8304204940795898, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.75, + "step": 246 + }, + { + "completion_length": 168.125, + "epoch": 0.06175, + "grad_norm": 0.4466553330421448, + "kl": 0.015615028329193592, + "learning_rate": 3.0875000000000005e-06, + "loss": 0.0006, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 247 + }, + { + "completion_length": 190.5, + "epoch": 0.062, + "grad_norm": 0.5046932101249695, + "kl": 0.020424310117959976, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0008, + "reward": 0.8062499761581421, + "reward_std": 0.7242915630340576, + "rewards/_accuracy_reward": 0.18125000596046448, + "rewards/_format_reward": 0.625, + "step": 248 + }, + { + "completion_length": 115.125, + "epoch": 0.06225, + "grad_norm": 0.9507879614830017, + "kl": 0.027173198759555817, + "learning_rate": 3.1125000000000007e-06, + "loss": 0.0011, + "reward": 1.4075000286102295, + "reward_std": 0.3749571740627289, + "rewards/_accuracy_reward": 0.4074999988079071, + "rewards/_format_reward": 1.0, + "step": 249 + }, + { + "completion_length": 226.25, + "epoch": 0.0625, + "grad_norm": 0.4514904022216797, + "kl": 0.014395845122635365, + "learning_rate": 3.125e-06, + "loss": 0.0006, + "reward": 1.15625, + "reward_std": 0.9904679656028748, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 0.625, + "step": 250 + }, + { + "completion_length": 183.0, + "epoch": 0.06275, + "grad_norm": 0.482563853263855, + "kl": 0.015461320988833904, + "learning_rate": 3.1375e-06, + "loss": 0.0006, + "reward": 1.46875, + "reward_std": 0.6999680995941162, + "rewards/_accuracy_reward": 0.59375, + "rewards/_format_reward": 0.875, + "step": 251 + }, + { + "completion_length": 210.875, + "epoch": 0.063, + "grad_norm": 0.44888806343078613, + "kl": 0.013685889542102814, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0005, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.875, + "step": 252 + }, + { + "completion_length": 183.625, + "epoch": 0.06325, + "grad_norm": 0.47426795959472656, + "kl": 0.01469960156828165, + "learning_rate": 3.1625000000000002e-06, + "loss": 0.0006, + "reward": 1.381250023841858, + "reward_std": 0.9133679866790771, + "rewards/_accuracy_reward": 0.6312500238418579, + "rewards/_format_reward": 0.75, + "step": 253 + }, + { + "completion_length": 123.25, + "epoch": 0.0635, + "grad_norm": 0.6966858506202698, + "kl": 0.02002432383596897, + "learning_rate": 3.175e-06, + "loss": 0.0008, + "reward": 1.5625, + "reward_std": 0.6648039817810059, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.75, + "step": 254 + }, + { + "completion_length": 194.25, + "epoch": 0.06375, + "grad_norm": 0.5240160226821899, + "kl": 0.025446368381381035, + "learning_rate": 3.1875e-06, + "loss": 0.001, + "reward": 1.100000023841858, + "reward_std": 0.7662525177001953, + "rewards/_accuracy_reward": 0.3499999940395355, + "rewards/_format_reward": 0.75, + "step": 255 + }, + { + "completion_length": 232.125, + "epoch": 0.064, + "grad_norm": 0.46555182337760925, + "kl": 0.013254818506538868, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0005, + "reward": 0.39374998211860657, + "reward_std": 0.5434265732765198, + "rewards/_accuracy_reward": 0.01875000074505806, + "rewards/_format_reward": 0.375, + "step": 256 + }, + { + "completion_length": 209.375, + "epoch": 0.06425, + "grad_norm": 0.4615981876850128, + "kl": 0.01638483628630638, + "learning_rate": 3.2125e-06, + "loss": 0.0007, + "reward": 1.15625, + "reward_std": 0.9904679656028748, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 0.625, + "step": 257 + }, + { + "completion_length": 176.0, + "epoch": 0.0645, + "grad_norm": 0.47257405519485474, + "kl": 0.01924911141395569, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0008, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 258 + }, + { + "completion_length": 188.25, + "epoch": 0.06475, + "grad_norm": 0.538159191608429, + "kl": 0.02660544216632843, + "learning_rate": 3.2375e-06, + "loss": 0.0011, + "reward": 1.193750023841858, + "reward_std": 0.8304204940795898, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.75, + "step": 259 + }, + { + "completion_length": 197.375, + "epoch": 0.065, + "grad_norm": 0.46662867069244385, + "kl": 0.021138174459338188, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0008, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 260 + }, + { + "completion_length": 182.25, + "epoch": 0.06525, + "grad_norm": 0.4960637390613556, + "kl": 0.023767707869410515, + "learning_rate": 3.2625e-06, + "loss": 0.001, + "reward": 0.8624999523162842, + "reward_std": 0.5403371453285217, + "rewards/_accuracy_reward": 0.11250000447034836, + "rewards/_format_reward": 0.75, + "step": 261 + }, + { + "completion_length": 100.0, + "epoch": 0.0655, + "grad_norm": 0.8033775687217712, + "kl": 0.021194277331233025, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.0008, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.668749988079071, + "rewards/_format_reward": 1.0, + "step": 262 + }, + { + "completion_length": 147.375, + "epoch": 0.06575, + "grad_norm": 0.5282887816429138, + "kl": 0.02339054085314274, + "learning_rate": 3.2875000000000003e-06, + "loss": 0.0009, + "reward": 1.506250023841858, + "reward_std": 0.4144165515899658, + "rewards/_accuracy_reward": 0.5062500238418579, + "rewards/_format_reward": 1.0, + "step": 263 + }, + { + "completion_length": 190.25, + "epoch": 0.066, + "grad_norm": 0.5130710005760193, + "kl": 0.020366905257105827, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0008, + "reward": 1.46875, + "reward_std": 0.6999680995941162, + "rewards/_accuracy_reward": 0.59375, + "rewards/_format_reward": 0.875, + "step": 264 + }, + { + "completion_length": 177.0, + "epoch": 0.06625, + "grad_norm": 0.6445785760879517, + "kl": 0.02009623870253563, + "learning_rate": 3.3125e-06, + "loss": 0.0008, + "reward": 1.162500023841858, + "reward_std": 0.8327021598815918, + "rewards/_accuracy_reward": 0.5375000238418579, + "rewards/_format_reward": 0.625, + "step": 265 + }, + { + "completion_length": 188.875, + "epoch": 0.0665, + "grad_norm": 0.4418463706970215, + "kl": 0.016711309552192688, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0007, + "reward": 0.90625, + "reward_std": 0.6483150720596313, + "rewards/_accuracy_reward": 0.1562499850988388, + "rewards/_format_reward": 0.75, + "step": 266 + }, + { + "completion_length": 132.625, + "epoch": 0.06675, + "grad_norm": 0.6308992505073547, + "kl": 0.02185102552175522, + "learning_rate": 3.3375000000000002e-06, + "loss": 0.0009, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 267 + }, + { + "completion_length": 187.75, + "epoch": 0.067, + "grad_norm": 0.5287885665893555, + "kl": 0.028705699369311333, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0011, + "reward": 1.6375000476837158, + "reward_std": 0.7224709987640381, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 0.875, + "step": 268 + }, + { + "completion_length": 145.25, + "epoch": 0.06725, + "grad_norm": 0.6234681606292725, + "kl": 0.027406934648752213, + "learning_rate": 3.3625000000000004e-06, + "loss": 0.0011, + "reward": 1.3125, + "reward_std": 0.5469068884849548, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 0.875, + "step": 269 + }, + { + "completion_length": 77.125, + "epoch": 0.0675, + "grad_norm": 0.047009434551000595, + "kl": 0.03576899319887161, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0014, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 270 + }, + { + "completion_length": 128.125, + "epoch": 0.06775, + "grad_norm": 0.6146723031997681, + "kl": 0.036112938076257706, + "learning_rate": 3.3875e-06, + "loss": 0.0014, + "reward": 1.28125, + "reward_std": 0.6187184453010559, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.875, + "step": 271 + }, + { + "completion_length": 151.375, + "epoch": 0.068, + "grad_norm": 0.5604023337364197, + "kl": 0.044364482164382935, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0018, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 272 + }, + { + "completion_length": 203.375, + "epoch": 0.06825, + "grad_norm": 0.560128390789032, + "kl": 0.018296649679541588, + "learning_rate": 3.4125000000000004e-06, + "loss": 0.0007, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 273 + }, + { + "completion_length": 186.0, + "epoch": 0.0685, + "grad_norm": 0.6812446117401123, + "kl": 0.03642842918634415, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.0015, + "reward": 0.7699999809265137, + "reward_std": 0.7155816555023193, + "rewards/_accuracy_reward": 0.14499999582767487, + "rewards/_format_reward": 0.625, + "step": 274 + }, + { + "completion_length": 199.25, + "epoch": 0.06875, + "grad_norm": 0.5815830230712891, + "kl": 0.036607492715120316, + "learning_rate": 3.4375e-06, + "loss": 0.0015, + "reward": 1.0625, + "reward_std": 0.933025062084198, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 0.625, + "step": 275 + }, + { + "completion_length": 185.0, + "epoch": 0.069, + "grad_norm": 1.3024146556854248, + "kl": 0.0446808896958828, + "learning_rate": 3.45e-06, + "loss": 0.0018, + "reward": 1.21875, + "reward_std": 0.8284828662872314, + "rewards/_accuracy_reward": 0.46875, + "rewards/_format_reward": 0.75, + "step": 276 + }, + { + "completion_length": 131.875, + "epoch": 0.06925, + "grad_norm": 0.6265179514884949, + "kl": 0.03437270596623421, + "learning_rate": 3.4625000000000003e-06, + "loss": 0.0014, + "reward": 1.59375, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 0.875, + "step": 277 + }, + { + "completion_length": 122.0, + "epoch": 0.0695, + "grad_norm": 0.6202266812324524, + "kl": 0.04929348826408386, + "learning_rate": 3.475e-06, + "loss": 0.002, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 278 + }, + { + "completion_length": 116.5, + "epoch": 0.06975, + "grad_norm": 0.8051105737686157, + "kl": 0.049348097294569016, + "learning_rate": 3.4875000000000005e-06, + "loss": 0.002, + "reward": 1.5750000476837158, + "reward_std": 0.4605897068977356, + "rewards/_accuracy_reward": 0.574999988079071, + "rewards/_format_reward": 1.0, + "step": 279 + }, + { + "completion_length": 202.5, + "epoch": 0.07, + "grad_norm": 0.5659109950065613, + "kl": 0.04884558916091919, + "learning_rate": 3.5e-06, + "loss": 0.002, + "reward": 1.125, + "reward_std": 0.7676494717597961, + "rewards/_accuracy_reward": 0.375, + "rewards/_format_reward": 0.75, + "step": 280 + }, + { + "completion_length": 81.0, + "epoch": 0.07025, + "grad_norm": 1.043556571006775, + "kl": 0.043515197932720184, + "learning_rate": 3.5125000000000003e-06, + "loss": 0.0017, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 281 + }, + { + "completion_length": 187.25, + "epoch": 0.0705, + "grad_norm": 1.5090237855911255, + "kl": 0.07255250960588455, + "learning_rate": 3.525e-06, + "loss": 0.0029, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 282 + }, + { + "completion_length": 111.375, + "epoch": 0.07075, + "grad_norm": 0.7345578670501709, + "kl": 0.04479851573705673, + "learning_rate": 3.5375000000000004e-06, + "loss": 0.0018, + "reward": 1.5750000476837158, + "reward_std": 0.4605897068977356, + "rewards/_accuracy_reward": 0.574999988079071, + "rewards/_format_reward": 1.0, + "step": 283 + }, + { + "completion_length": 66.25, + "epoch": 0.071, + "grad_norm": 1.220955491065979, + "kl": 0.05355329439043999, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0021, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 284 + }, + { + "completion_length": 124.375, + "epoch": 0.07125, + "grad_norm": 0.8568919897079468, + "kl": 0.040953654795885086, + "learning_rate": 3.5625e-06, + "loss": 0.0016, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 285 + }, + { + "completion_length": 141.25, + "epoch": 0.0715, + "grad_norm": 0.030418027192354202, + "kl": 0.03461394086480141, + "learning_rate": 3.575e-06, + "loss": 0.0014, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 286 + }, + { + "completion_length": 107.5, + "epoch": 0.07175, + "grad_norm": 0.7467533349990845, + "kl": 0.05478259548544884, + "learning_rate": 3.5875000000000004e-06, + "loss": 0.0022, + "reward": 1.4262499809265137, + "reward_std": 0.48056328296661377, + "rewards/_accuracy_reward": 0.42624998092651367, + "rewards/_format_reward": 1.0, + "step": 287 + }, + { + "completion_length": 151.25, + "epoch": 0.072, + "grad_norm": 0.6246471405029297, + "kl": 0.036742065101861954, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0015, + "reward": 1.2625000476837158, + "reward_std": 0.8826704621315002, + "rewards/_accuracy_reward": 0.512499988079071, + "rewards/_format_reward": 0.75, + "step": 288 + }, + { + "completion_length": 126.625, + "epoch": 0.07225, + "grad_norm": 0.7587897777557373, + "kl": 0.06992122530937195, + "learning_rate": 3.6125000000000006e-06, + "loss": 0.0028, + "reward": 1.59375, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 0.875, + "step": 289 + }, + { + "completion_length": 170.375, + "epoch": 0.0725, + "grad_norm": 0.5562832951545715, + "kl": 0.053098414093256, + "learning_rate": 3.625e-06, + "loss": 0.0021, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.875, + "step": 290 + }, + { + "completion_length": 54.625, + "epoch": 0.07275, + "grad_norm": 1.5443010330200195, + "kl": 0.07868895679712296, + "learning_rate": 3.6375000000000003e-06, + "loss": 0.0031, + "reward": 1.318750023841858, + "reward_std": 0.28402402997016907, + "rewards/_accuracy_reward": 0.3187499940395355, + "rewards/_format_reward": 1.0, + "step": 291 + }, + { + "completion_length": 196.25, + "epoch": 0.073, + "grad_norm": 0.6690995097160339, + "kl": 0.029539842158555984, + "learning_rate": 3.65e-06, + "loss": 0.0012, + "reward": 0.668749988079071, + "reward_std": 0.7736451625823975, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 0.5, + "step": 292 + }, + { + "completion_length": 174.125, + "epoch": 0.07325, + "grad_norm": 0.6143000721931458, + "kl": 0.05302724614739418, + "learning_rate": 3.6625000000000005e-06, + "loss": 0.0021, + "reward": 1.4375, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.75, + "step": 293 + }, + { + "completion_length": 118.125, + "epoch": 0.0735, + "grad_norm": 0.7825801372528076, + "kl": 0.06216191127896309, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0025, + "reward": 1.600000023841858, + "reward_std": 0.43260011076927185, + "rewards/_accuracy_reward": 0.6000000238418579, + "rewards/_format_reward": 1.0, + "step": 294 + }, + { + "completion_length": 181.25, + "epoch": 0.07375, + "grad_norm": 0.6875461935997009, + "kl": 0.03431488573551178, + "learning_rate": 3.6875000000000007e-06, + "loss": 0.0014, + "reward": 1.46875, + "reward_std": 0.6999680995941162, + "rewards/_accuracy_reward": 0.59375, + "rewards/_format_reward": 0.875, + "step": 295 + }, + { + "completion_length": 99.75, + "epoch": 0.074, + "grad_norm": 0.972676157951355, + "kl": 0.05988858640193939, + "learning_rate": 3.7e-06, + "loss": 0.0024, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 296 + }, + { + "completion_length": 120.5, + "epoch": 0.07425, + "grad_norm": 0.6937150955200195, + "kl": 0.06560764461755753, + "learning_rate": 3.7125000000000005e-06, + "loss": 0.0026, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 297 + }, + { + "completion_length": 110.0, + "epoch": 0.0745, + "grad_norm": 1.0664499998092651, + "kl": 0.08323174715042114, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0033, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 298 + }, + { + "completion_length": 64.75, + "epoch": 0.07475, + "grad_norm": 0.9208748936653137, + "kl": 0.051380615681409836, + "learning_rate": 3.7375000000000006e-06, + "loss": 0.0021, + "reward": 1.4812500476837158, + "reward_std": 0.4374745190143585, + "rewards/_accuracy_reward": 0.48124998807907104, + "rewards/_format_reward": 1.0, + "step": 299 + }, + { + "completion_length": 148.0, + "epoch": 0.075, + "grad_norm": 0.6525527238845825, + "kl": 0.05159568786621094, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0021, + "reward": 1.087499976158142, + "reward_std": 0.5442885160446167, + "rewards/_accuracy_reward": 0.2124999761581421, + "rewards/_format_reward": 0.875, + "step": 300 + }, + { + "completion_length": 167.25, + "epoch": 0.07525, + "grad_norm": 0.7984333038330078, + "kl": 0.06883440166711807, + "learning_rate": 3.7625e-06, + "loss": 0.0028, + "reward": 1.6875, + "reward_std": 0.6373774409294128, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 301 + }, + { + "completion_length": 104.625, + "epoch": 0.0755, + "grad_norm": 0.7018251419067383, + "kl": 0.04696748033165932, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0019, + "reward": 1.3624999523162842, + "reward_std": 0.404218852519989, + "rewards/_accuracy_reward": 0.36249998211860657, + "rewards/_format_reward": 1.0, + "step": 302 + }, + { + "completion_length": 148.375, + "epoch": 0.07575, + "grad_norm": 0.07361488789319992, + "kl": 0.08332055807113647, + "learning_rate": 3.7875e-06, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 303 + }, + { + "completion_length": 132.625, + "epoch": 0.076, + "grad_norm": 0.7021421194076538, + "kl": 0.05917055159807205, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0024, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 304 + }, + { + "completion_length": 175.5, + "epoch": 0.07625, + "grad_norm": 0.6244091391563416, + "kl": 0.06270697712898254, + "learning_rate": 3.8125e-06, + "loss": 0.0025, + "reward": 1.2687499523162842, + "reward_std": 0.699968159198761, + "rewards/_accuracy_reward": 0.518750011920929, + "rewards/_format_reward": 0.75, + "step": 305 + }, + { + "completion_length": 98.25, + "epoch": 0.0765, + "grad_norm": 1.3864850997924805, + "kl": 0.078142911195755, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0031, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 1.0, + "step": 306 + }, + { + "completion_length": 134.5, + "epoch": 0.07675, + "grad_norm": 0.5813739895820618, + "kl": 0.041886672377586365, + "learning_rate": 3.8375e-06, + "loss": 0.0017, + "reward": 1.2312500476837158, + "reward_std": 0.6284547448158264, + "rewards/_accuracy_reward": 0.35624998807907104, + "rewards/_format_reward": 0.875, + "step": 307 + }, + { + "completion_length": 168.5, + "epoch": 0.077, + "grad_norm": 0.5561500191688538, + "kl": 0.06397830694913864, + "learning_rate": 3.85e-06, + "loss": 0.0026, + "reward": 1.3250000476837158, + "reward_std": 0.6850443482398987, + "rewards/_accuracy_reward": 0.44999998807907104, + "rewards/_format_reward": 0.875, + "step": 308 + }, + { + "completion_length": 126.125, + "epoch": 0.07725, + "grad_norm": 0.6728265881538391, + "kl": 0.057980045676231384, + "learning_rate": 3.8625e-06, + "loss": 0.0023, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 309 + }, + { + "completion_length": 105.25, + "epoch": 0.0775, + "grad_norm": 0.9413458108901978, + "kl": 0.10995390266180038, + "learning_rate": 3.875e-06, + "loss": 0.0044, + "reward": 1.5950000286102295, + "reward_std": 0.4400324523448944, + "rewards/_accuracy_reward": 0.5950000286102295, + "rewards/_format_reward": 1.0, + "step": 310 + }, + { + "completion_length": 108.5, + "epoch": 0.07775, + "grad_norm": 0.742570698261261, + "kl": 0.06104608625173569, + "learning_rate": 3.8875000000000005e-06, + "loss": 0.0024, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7575000524520874, + "rewards/_format_reward": 1.0, + "step": 311 + }, + { + "completion_length": 110.75, + "epoch": 0.078, + "grad_norm": 1.0395474433898926, + "kl": 0.088965505361557, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0036, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.643750011920929, + "rewards/_format_reward": 1.0, + "step": 312 + }, + { + "completion_length": 144.875, + "epoch": 0.07825, + "grad_norm": 0.6619213223457336, + "kl": 0.06248822063207626, + "learning_rate": 3.9125e-06, + "loss": 0.0025, + "reward": 1.0625, + "reward_std": 0.5403372049331665, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.875, + "step": 313 + }, + { + "completion_length": 109.0, + "epoch": 0.0785, + "grad_norm": 0.7599800229072571, + "kl": 0.0571325309574604, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0023, + "reward": 1.4075000286102295, + "reward_std": 0.3749571442604065, + "rewards/_accuracy_reward": 0.4074999988079071, + "rewards/_format_reward": 1.0, + "step": 314 + }, + { + "completion_length": 73.0, + "epoch": 0.07875, + "grad_norm": 0.9942233562469482, + "kl": 0.05285262688994408, + "learning_rate": 3.9375e-06, + "loss": 0.0021, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 315 + }, + { + "completion_length": 163.0, + "epoch": 0.079, + "grad_norm": 0.7015470266342163, + "kl": 0.06908886879682541, + "learning_rate": 3.95e-06, + "loss": 0.0028, + "reward": 1.3250000476837158, + "reward_std": 0.6850443482398987, + "rewards/_accuracy_reward": 0.44999998807907104, + "rewards/_format_reward": 0.875, + "step": 316 + }, + { + "completion_length": 72.875, + "epoch": 0.07925, + "grad_norm": 1.0622810125350952, + "kl": 0.10289790481328964, + "learning_rate": 3.962500000000001e-06, + "loss": 0.0041, + "reward": 1.658750057220459, + "reward_std": 0.47675803303718567, + "rewards/_accuracy_reward": 0.6587499976158142, + "rewards/_format_reward": 1.0, + "step": 317 + }, + { + "completion_length": 72.0, + "epoch": 0.0795, + "grad_norm": 0.8196299076080322, + "kl": 0.05889907479286194, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0024, + "reward": 1.412500023841858, + "reward_std": 0.3691205680370331, + "rewards/_accuracy_reward": 0.4124999940395355, + "rewards/_format_reward": 1.0, + "step": 318 + }, + { + "completion_length": 106.25, + "epoch": 0.07975, + "grad_norm": 0.8157685399055481, + "kl": 0.06252449005842209, + "learning_rate": 3.9875e-06, + "loss": 0.0025, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 319 + }, + { + "completion_length": 101.25, + "epoch": 0.08, + "grad_norm": 0.7154927253723145, + "kl": 0.05639846250414848, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0023, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 320 + }, + { + "completion_length": 118.75, + "epoch": 0.08025, + "grad_norm": 0.950957179069519, + "kl": 0.11179396510124207, + "learning_rate": 4.0125e-06, + "loss": 0.0045, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 321 + }, + { + "completion_length": 111.125, + "epoch": 0.0805, + "grad_norm": 0.7619085907936096, + "kl": 0.060069490224123, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0024, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 322 + }, + { + "completion_length": 128.5, + "epoch": 0.08075, + "grad_norm": 0.9389228224754333, + "kl": 0.0730435773730278, + "learning_rate": 4.037500000000001e-06, + "loss": 0.0029, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 323 + }, + { + "completion_length": 158.5, + "epoch": 0.081, + "grad_norm": 0.846824049949646, + "kl": 0.08881374448537827, + "learning_rate": 4.05e-06, + "loss": 0.0036, + "reward": 1.1875, + "reward_std": 0.9519716501235962, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.625, + "step": 324 + }, + { + "completion_length": 155.375, + "epoch": 0.08125, + "grad_norm": 0.58637934923172, + "kl": 0.08695843815803528, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.0035, + "reward": 0.9187499284744263, + "reward_std": 0.3514637351036072, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 0.75, + "step": 325 + }, + { + "completion_length": 84.625, + "epoch": 0.0815, + "grad_norm": 1.0473264455795288, + "kl": 0.06300223618745804, + "learning_rate": 4.075e-06, + "loss": 0.0025, + "reward": 1.5499999523162842, + "reward_std": 0.4855042099952698, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 326 + }, + { + "completion_length": 139.75, + "epoch": 0.08175, + "grad_norm": 0.7010840177536011, + "kl": 0.05449886992573738, + "learning_rate": 4.0875e-06, + "loss": 0.0022, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 327 + }, + { + "completion_length": 97.5, + "epoch": 0.082, + "grad_norm": 0.8196879029273987, + "kl": 0.09262411296367645, + "learning_rate": 4.1e-06, + "loss": 0.0037, + "reward": 1.5499999523162842, + "reward_std": 0.4855042099952698, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 328 + }, + { + "completion_length": 124.25, + "epoch": 0.08225, + "grad_norm": 1.0033892393112183, + "kl": 0.10067097842693329, + "learning_rate": 4.1125e-06, + "loss": 0.004, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 329 + }, + { + "completion_length": 155.125, + "epoch": 0.0825, + "grad_norm": 0.6117260456085205, + "kl": 0.0802069678902626, + "learning_rate": 4.125e-06, + "loss": 0.0032, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 330 + }, + { + "completion_length": 90.625, + "epoch": 0.08275, + "grad_norm": 1.334694743156433, + "kl": 0.11455602198839188, + "learning_rate": 4.137500000000001e-06, + "loss": 0.0046, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 331 + }, + { + "completion_length": 76.5, + "epoch": 0.083, + "grad_norm": 1.0724419355392456, + "kl": 0.09013547003269196, + "learning_rate": 4.15e-06, + "loss": 0.0036, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 332 + }, + { + "completion_length": 102.875, + "epoch": 0.08325, + "grad_norm": 1.0087305307388306, + "kl": 0.0930456668138504, + "learning_rate": 4.1625e-06, + "loss": 0.0037, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 333 + }, + { + "completion_length": 137.875, + "epoch": 0.0835, + "grad_norm": 0.6068035960197449, + "kl": 0.0930265486240387, + "learning_rate": 4.175e-06, + "loss": 0.0037, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 334 + }, + { + "completion_length": 83.375, + "epoch": 0.08375, + "grad_norm": 1.1466654539108276, + "kl": 0.09693353623151779, + "learning_rate": 4.1875e-06, + "loss": 0.0039, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 335 + }, + { + "completion_length": 135.125, + "epoch": 0.084, + "grad_norm": 1.0346399545669556, + "kl": 0.07693766057491302, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0031, + "reward": 1.5700000524520874, + "reward_std": 0.46727174520492554, + "rewards/_accuracy_reward": 0.5699999928474426, + "rewards/_format_reward": 1.0, + "step": 336 + }, + { + "completion_length": 127.375, + "epoch": 0.08425, + "grad_norm": 0.7949957847595215, + "kl": 0.16617698967456818, + "learning_rate": 4.212500000000001e-06, + "loss": 0.0066, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 337 + }, + { + "completion_length": 98.25, + "epoch": 0.0845, + "grad_norm": 0.9801748394966125, + "kl": 0.09285181015729904, + "learning_rate": 4.225e-06, + "loss": 0.0037, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.550000011920929, + "rewards/_format_reward": 1.0, + "step": 338 + }, + { + "completion_length": 129.0, + "epoch": 0.08475, + "grad_norm": 1.0321601629257202, + "kl": 0.08525305241346359, + "learning_rate": 4.2375000000000005e-06, + "loss": 0.0034, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 339 + }, + { + "completion_length": 117.0, + "epoch": 0.085, + "grad_norm": 1.1353120803833008, + "kl": 0.10381244868040085, + "learning_rate": 4.25e-06, + "loss": 0.0042, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 340 + }, + { + "completion_length": 67.5, + "epoch": 0.08525, + "grad_norm": 0.987204372882843, + "kl": 0.1275581568479538, + "learning_rate": 4.2625e-06, + "loss": 0.0051, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 341 + }, + { + "completion_length": 88.25, + "epoch": 0.0855, + "grad_norm": 1.1097989082336426, + "kl": 0.10488829016685486, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0042, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7574999928474426, + "rewards/_format_reward": 1.0, + "step": 342 + }, + { + "completion_length": 101.625, + "epoch": 0.08575, + "grad_norm": 1.0648112297058105, + "kl": 0.11403176933526993, + "learning_rate": 4.287500000000001e-06, + "loss": 0.0046, + "reward": 1.3875000476837158, + "reward_std": 0.3879893124103546, + "rewards/_accuracy_reward": 0.38749998807907104, + "rewards/_format_reward": 1.0, + "step": 343 + }, + { + "completion_length": 98.875, + "epoch": 0.086, + "grad_norm": 0.8387040495872498, + "kl": 0.11271817982196808, + "learning_rate": 4.3e-06, + "loss": 0.0045, + "reward": 1.756250023841858, + "reward_std": 0.6894291639328003, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 344 + }, + { + "completion_length": 128.125, + "epoch": 0.08625, + "grad_norm": 0.7294446229934692, + "kl": 0.12787966430187225, + "learning_rate": 4.312500000000001e-06, + "loss": 0.0051, + "reward": 1.1762499809265137, + "reward_std": 0.6348889470100403, + "rewards/_accuracy_reward": 0.30125001072883606, + "rewards/_format_reward": 0.875, + "step": 345 + }, + { + "completion_length": 159.25, + "epoch": 0.0865, + "grad_norm": 0.8217050433158875, + "kl": 0.14619140326976776, + "learning_rate": 4.325e-06, + "loss": 0.0058, + "reward": 1.162500023841858, + "reward_std": 0.832702100276947, + "rewards/_accuracy_reward": 0.5375000238418579, + "rewards/_format_reward": 0.625, + "step": 346 + }, + { + "completion_length": 129.25, + "epoch": 0.08675, + "grad_norm": 0.9098784923553467, + "kl": 0.13450828194618225, + "learning_rate": 4.3375e-06, + "loss": 0.0054, + "reward": 1.431249976158142, + "reward_std": 0.47579824924468994, + "rewards/_accuracy_reward": 0.4312499761581421, + "rewards/_format_reward": 1.0, + "step": 347 + }, + { + "completion_length": 151.0, + "epoch": 0.087, + "grad_norm": 0.7144613265991211, + "kl": 0.1879410594701767, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0075, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.875, + "step": 348 + }, + { + "completion_length": 84.375, + "epoch": 0.08725, + "grad_norm": 1.6553195714950562, + "kl": 0.10268331319093704, + "learning_rate": 4.362500000000001e-06, + "loss": 0.0041, + "reward": 1.375, + "reward_std": 0.6681531071662903, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.875, + "step": 349 + }, + { + "completion_length": 61.25, + "epoch": 0.0875, + "grad_norm": 1.4001189470291138, + "kl": 0.18523964285850525, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0074, + "reward": 1.5012500286102295, + "reward_std": 0.42089828848838806, + "rewards/_accuracy_reward": 0.5012500286102295, + "rewards/_format_reward": 1.0, + "step": 350 + }, + { + "completion_length": 139.375, + "epoch": 0.08775, + "grad_norm": 0.5610023140907288, + "kl": 0.09610922634601593, + "learning_rate": 4.3875e-06, + "loss": 0.0038, + "reward": 1.375, + "reward_std": 0.6681531071662903, + "rewards/_accuracy_reward": 0.5, + "rewards/_format_reward": 0.875, + "step": 351 + }, + { + "completion_length": 167.5, + "epoch": 0.088, + "grad_norm": 0.8529553413391113, + "kl": 0.1177278384566307, + "learning_rate": 4.4e-06, + "loss": 0.0047, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.2812499701976776, + "rewards/_format_reward": 0.875, + "step": 352 + }, + { + "completion_length": 134.75, + "epoch": 0.08825, + "grad_norm": 0.7346218824386597, + "kl": 0.12559470534324646, + "learning_rate": 4.4125000000000005e-06, + "loss": 0.005, + "reward": 1.274999976158142, + "reward_std": 0.6974443197250366, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.875, + "step": 353 + }, + { + "completion_length": 131.375, + "epoch": 0.0885, + "grad_norm": 0.7096778154373169, + "kl": 0.15666167438030243, + "learning_rate": 4.425e-06, + "loss": 0.0063, + "reward": 1.6875, + "reward_std": 0.6373774409294128, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 354 + }, + { + "completion_length": 129.5, + "epoch": 0.08875, + "grad_norm": 0.7917588949203491, + "kl": 0.16912636160850525, + "learning_rate": 4.4375e-06, + "loss": 0.0068, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 355 + }, + { + "completion_length": 108.75, + "epoch": 0.089, + "grad_norm": 1.4511607885360718, + "kl": 0.1251247525215149, + "learning_rate": 4.450000000000001e-06, + "loss": 0.005, + "reward": 1.5012500286102295, + "reward_std": 0.42089828848838806, + "rewards/_accuracy_reward": 0.5012500286102295, + "rewards/_format_reward": 1.0, + "step": 356 + }, + { + "completion_length": 114.25, + "epoch": 0.08925, + "grad_norm": 0.9404997229576111, + "kl": 0.09536008536815643, + "learning_rate": 4.4625e-06, + "loss": 0.0038, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 357 + }, + { + "completion_length": 101.25, + "epoch": 0.0895, + "grad_norm": 0.7459734082221985, + "kl": 0.1477159559726715, + "learning_rate": 4.475e-06, + "loss": 0.0059, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 358 + }, + { + "completion_length": 85.625, + "epoch": 0.08975, + "grad_norm": 1.143880009651184, + "kl": 0.09666404128074646, + "learning_rate": 4.4875e-06, + "loss": 0.0039, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 359 + }, + { + "completion_length": 204.375, + "epoch": 0.09, + "grad_norm": 0.9317293167114258, + "kl": 0.31011974811553955, + "learning_rate": 4.5e-06, + "loss": 0.0124, + "reward": 0.762499988079071, + "reward_std": 0.8907260298728943, + "rewards/_accuracy_reward": 0.26249998807907104, + "rewards/_format_reward": 0.5, + "step": 360 + }, + { + "completion_length": 164.0, + "epoch": 0.09025, + "grad_norm": 0.8388033509254456, + "kl": 0.1918019950389862, + "learning_rate": 4.5125e-06, + "loss": 0.0077, + "reward": 1.0512499809265137, + "reward_std": 0.7511028051376343, + "rewards/_accuracy_reward": 0.30124998092651367, + "rewards/_format_reward": 0.75, + "step": 361 + }, + { + "completion_length": 73.125, + "epoch": 0.0905, + "grad_norm": 1.0067369937896729, + "kl": 0.09683433175086975, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0039, + "reward": 1.243749976158142, + "reward_std": 0.32120034098625183, + "rewards/_accuracy_reward": 0.24374999105930328, + "rewards/_format_reward": 1.0, + "step": 362 + }, + { + "completion_length": 175.625, + "epoch": 0.09075, + "grad_norm": 0.79710453748703, + "kl": 0.15141978859901428, + "learning_rate": 4.5375e-06, + "loss": 0.0061, + "reward": 1.03125, + "reward_std": 0.9300297498703003, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 0.5, + "step": 363 + }, + { + "completion_length": 106.25, + "epoch": 0.091, + "grad_norm": 0.7696452736854553, + "kl": 0.13706472516059875, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0055, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 364 + }, + { + "completion_length": 102.375, + "epoch": 0.09125, + "grad_norm": 0.6263626217842102, + "kl": 0.18292012810707092, + "learning_rate": 4.5625e-06, + "loss": 0.0073, + "reward": 1.09375, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.21875, + "rewards/_format_reward": 0.875, + "step": 365 + }, + { + "completion_length": 123.75, + "epoch": 0.0915, + "grad_norm": 0.6248298287391663, + "kl": 0.0651947632431984, + "learning_rate": 4.575e-06, + "loss": 0.0026, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 366 + }, + { + "completion_length": 108.5, + "epoch": 0.09175, + "grad_norm": 1.123427391052246, + "kl": 0.1323918253183365, + "learning_rate": 4.5875000000000005e-06, + "loss": 0.0053, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 367 + }, + { + "completion_length": 159.5, + "epoch": 0.092, + "grad_norm": 0.7406381368637085, + "kl": 0.1296067088842392, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.643750011920929, + "rewards/_format_reward": 1.0, + "step": 368 + }, + { + "completion_length": 164.75, + "epoch": 0.09225, + "grad_norm": 0.717776358127594, + "kl": 0.1630459874868393, + "learning_rate": 4.6125e-06, + "loss": 0.0065, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 369 + }, + { + "completion_length": 86.25, + "epoch": 0.0925, + "grad_norm": 1.1500803232192993, + "kl": 0.12640973925590515, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0051, + "reward": 1.756250023841858, + "reward_std": 0.45153507590293884, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 370 + }, + { + "completion_length": 161.375, + "epoch": 0.09275, + "grad_norm": 0.8110669255256653, + "kl": 0.13621176779270172, + "learning_rate": 4.6375e-06, + "loss": 0.0054, + "reward": 1.1437499523162842, + "reward_std": 0.8317097425460815, + "rewards/_accuracy_reward": 0.39374998211860657, + "rewards/_format_reward": 0.75, + "step": 371 + }, + { + "completion_length": 175.375, + "epoch": 0.093, + "grad_norm": 0.7024534344673157, + "kl": 0.1695256382226944, + "learning_rate": 4.65e-06, + "loss": 0.0068, + "reward": 1.1875, + "reward_std": 0.9519716501235962, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.625, + "step": 372 + }, + { + "completion_length": 145.125, + "epoch": 0.09325, + "grad_norm": 1.101328730583191, + "kl": 0.17406335473060608, + "learning_rate": 4.662500000000001e-06, + "loss": 0.007, + "reward": 1.381250023841858, + "reward_std": 0.9133679866790771, + "rewards/_accuracy_reward": 0.6312500238418579, + "rewards/_format_reward": 0.75, + "step": 373 + }, + { + "completion_length": 164.5, + "epoch": 0.0935, + "grad_norm": 0.740450918674469, + "kl": 0.15864543616771698, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0063, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 374 + }, + { + "completion_length": 200.25, + "epoch": 0.09375, + "grad_norm": 0.7958067059516907, + "kl": 0.35587310791015625, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.0142, + "reward": 1.125, + "reward_std": 0.9910312294960022, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.5, + "step": 375 + }, + { + "completion_length": 119.125, + "epoch": 0.094, + "grad_norm": 0.9200760126113892, + "kl": 0.07643859833478928, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 376 + }, + { + "completion_length": 120.75, + "epoch": 0.09425, + "grad_norm": 0.7784951329231262, + "kl": 0.09547189623117447, + "learning_rate": 4.7125e-06, + "loss": 0.0038, + "reward": 1.225000023841858, + "reward_std": 0.8180989027023315, + "rewards/_accuracy_reward": 0.4749999940395355, + "rewards/_format_reward": 0.75, + "step": 377 + }, + { + "completion_length": 76.0, + "epoch": 0.0945, + "grad_norm": 1.4544790983200073, + "kl": 0.16331760585308075, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 378 + }, + { + "completion_length": 105.75, + "epoch": 0.09475, + "grad_norm": 0.9481674432754517, + "kl": 0.1655467301607132, + "learning_rate": 4.737500000000001e-06, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 379 + }, + { + "completion_length": 117.625, + "epoch": 0.095, + "grad_norm": 1.2458184957504272, + "kl": 0.12308754026889801, + "learning_rate": 4.75e-06, + "loss": 0.0049, + "reward": 1.2625000476837158, + "reward_std": 0.31707367300987244, + "rewards/_accuracy_reward": 0.38749998807907104, + "rewards/_format_reward": 0.875, + "step": 380 + }, + { + "completion_length": 141.875, + "epoch": 0.09525, + "grad_norm": 2.0550785064697266, + "kl": 0.24554966390132904, + "learning_rate": 4.7625000000000006e-06, + "loss": 0.0098, + "reward": 1.21875, + "reward_std": 0.8284828662872314, + "rewards/_accuracy_reward": 0.46875, + "rewards/_format_reward": 0.75, + "step": 381 + }, + { + "completion_length": 134.375, + "epoch": 0.0955, + "grad_norm": 0.8591959476470947, + "kl": 0.10585917532444, + "learning_rate": 4.775e-06, + "loss": 0.0042, + "reward": 1.6387500762939453, + "reward_std": 0.49872517585754395, + "rewards/_accuracy_reward": 0.6387499570846558, + "rewards/_format_reward": 1.0, + "step": 382 + }, + { + "completion_length": 114.25, + "epoch": 0.09575, + "grad_norm": 0.794729471206665, + "kl": 0.24316060543060303, + "learning_rate": 4.7875e-06, + "loss": 0.0097, + "reward": 1.0625, + "reward_std": 0.5403371453285217, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.875, + "step": 383 + }, + { + "completion_length": 122.5, + "epoch": 0.096, + "grad_norm": 1.0003390312194824, + "kl": 0.10567362606525421, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0042, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 384 + }, + { + "completion_length": 96.375, + "epoch": 0.09625, + "grad_norm": 0.9128509163856506, + "kl": 0.09176424145698547, + "learning_rate": 4.8125e-06, + "loss": 0.0037, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 385 + }, + { + "completion_length": 87.0, + "epoch": 0.0965, + "grad_norm": 0.8376882672309875, + "kl": 0.08349757641553879, + "learning_rate": 4.825e-06, + "loss": 0.0033, + "reward": 1.53125, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 1.0, + "step": 386 + }, + { + "completion_length": 116.125, + "epoch": 0.09675, + "grad_norm": 1.173961877822876, + "kl": 0.08239471912384033, + "learning_rate": 4.837500000000001e-06, + "loss": 0.0033, + "reward": 1.3624999523162842, + "reward_std": 0.404218852519989, + "rewards/_accuracy_reward": 0.36249998211860657, + "rewards/_format_reward": 1.0, + "step": 387 + }, + { + "completion_length": 122.375, + "epoch": 0.097, + "grad_norm": 0.07301829755306244, + "kl": 0.10658504068851471, + "learning_rate": 4.85e-06, + "loss": 0.0043, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 388 + }, + { + "completion_length": 89.125, + "epoch": 0.09725, + "grad_norm": 0.2543293237686157, + "kl": 0.1679508537054062, + "learning_rate": 4.8625000000000005e-06, + "loss": 0.0067, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 389 + }, + { + "completion_length": 116.125, + "epoch": 0.0975, + "grad_norm": 0.9531389474868774, + "kl": 0.10527552664279938, + "learning_rate": 4.875e-06, + "loss": 0.0042, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 390 + }, + { + "completion_length": 118.875, + "epoch": 0.09775, + "grad_norm": 1.4362037181854248, + "kl": 0.2108982801437378, + "learning_rate": 4.8875e-06, + "loss": 0.0084, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 391 + }, + { + "completion_length": 112.25, + "epoch": 0.098, + "grad_norm": 0.7744788527488708, + "kl": 0.09784118831157684, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0039, + "reward": 1.662500023841858, + "reward_std": 0.4711308181285858, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 0.875, + "step": 392 + }, + { + "completion_length": 125.75, + "epoch": 0.09825, + "grad_norm": 0.11483877152204514, + "kl": 0.09153156727552414, + "learning_rate": 4.912500000000001e-06, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 393 + }, + { + "completion_length": 119.25, + "epoch": 0.0985, + "grad_norm": 0.9200050234794617, + "kl": 0.12948504090309143, + "learning_rate": 4.925e-06, + "loss": 0.0052, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 394 + }, + { + "completion_length": 175.375, + "epoch": 0.09875, + "grad_norm": 0.7395903468132019, + "kl": 0.12225595861673355, + "learning_rate": 4.937500000000001e-06, + "loss": 0.0049, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 395 + }, + { + "completion_length": 122.5, + "epoch": 0.099, + "grad_norm": 0.6254833936691284, + "kl": 0.0814957544207573, + "learning_rate": 4.95e-06, + "loss": 0.0033, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 396 + }, + { + "completion_length": 150.875, + "epoch": 0.09925, + "grad_norm": 1.205034613609314, + "kl": 0.08875016123056412, + "learning_rate": 4.9625e-06, + "loss": 0.0036, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 397 + }, + { + "completion_length": 161.125, + "epoch": 0.0995, + "grad_norm": 0.9321796894073486, + "kl": 0.16619841754436493, + "learning_rate": 4.975000000000001e-06, + "loss": 0.0066, + "reward": 1.7512500286102295, + "reward_std": 0.4606032371520996, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 0.875, + "step": 398 + }, + { + "completion_length": 159.25, + "epoch": 0.09975, + "grad_norm": 0.8920565843582153, + "kl": 0.12888400256633759, + "learning_rate": 4.987500000000001e-06, + "loss": 0.0052, + "reward": 1.274999976158142, + "reward_std": 0.6974443197250366, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.875, + "step": 399 + }, + { + "completion_length": 118.5, + "epoch": 0.1, + "grad_norm": 0.8786934018135071, + "kl": 0.09915917366743088, + "learning_rate": 5e-06, + "loss": 0.004, + "reward": 1.6875, + "reward_std": 0.4381372928619385, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 400 + }, + { + "completion_length": 77.625, + "epoch": 0.10025, + "grad_norm": 1.2590632438659668, + "kl": 0.14523717761039734, + "learning_rate": 4.999999048070624e-06, + "loss": 0.0058, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 401 + }, + { + "completion_length": 63.125, + "epoch": 0.1005, + "grad_norm": 1.158711314201355, + "kl": 0.10946811735630035, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0044, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 402 + }, + { + "completion_length": 115.375, + "epoch": 0.10075, + "grad_norm": 1.5086749792099, + "kl": 0.7569040656089783, + "learning_rate": 4.999991432639962e-06, + "loss": 0.0303, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 403 + }, + { + "completion_length": 153.375, + "epoch": 0.101, + "grad_norm": 0.7452853322029114, + "kl": 0.08908268809318542, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0036, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 1.0, + "step": 404 + }, + { + "completion_length": 83.75, + "epoch": 0.10125, + "grad_norm": 1.3645873069763184, + "kl": 0.10320331156253815, + "learning_rate": 4.999976201801837e-06, + "loss": 0.0041, + "reward": 1.125, + "reward_std": 0.10350986570119858, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 1.0, + "step": 405 + }, + { + "completion_length": 72.25, + "epoch": 0.1015, + "grad_norm": 1.1549677848815918, + "kl": 0.09673086553812027, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0039, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 406 + }, + { + "completion_length": 96.125, + "epoch": 0.10175, + "grad_norm": 1.009154200553894, + "kl": 0.1394815593957901, + "learning_rate": 4.999953355602643e-06, + "loss": 0.0056, + "reward": 1.399999976158142, + "reward_std": 0.4971346855163574, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 0.875, + "step": 407 + }, + { + "completion_length": 125.375, + "epoch": 0.102, + "grad_norm": 0.9860761165618896, + "kl": 0.13221774995326996, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0053, + "reward": 1.149999976158142, + "reward_std": 0.8220185041427612, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.75, + "step": 408 + }, + { + "completion_length": 60.375, + "epoch": 0.10225, + "grad_norm": 0.053071990609169006, + "kl": 0.0779157504439354, + "learning_rate": 4.999922894111975e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 409 + }, + { + "completion_length": 89.875, + "epoch": 0.1025, + "grad_norm": 0.9673873782157898, + "kl": 0.08007065951824188, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.0032, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 410 + }, + { + "completion_length": 113.25, + "epoch": 0.10275, + "grad_norm": 0.6946626901626587, + "kl": 0.09533973038196564, + "learning_rate": 4.9998848174226225e-06, + "loss": 0.0038, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 411 + }, + { + "completion_length": 96.125, + "epoch": 0.103, + "grad_norm": 0.05451874062418938, + "kl": 0.07452833652496338, + "learning_rate": 4.999862923413781e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 412 + }, + { + "completion_length": 72.0, + "epoch": 0.10325, + "grad_norm": 0.9213165044784546, + "kl": 0.1262635439634323, + "learning_rate": 4.999839125650576e-06, + "loss": 0.0051, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 413 + }, + { + "completion_length": 67.125, + "epoch": 0.1035, + "grad_norm": 1.0785224437713623, + "kl": 0.0754222422838211, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.003, + "reward": 1.3125, + "reward_std": 0.4299086928367615, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 1.0, + "step": 414 + }, + { + "completion_length": 157.125, + "epoch": 0.10375, + "grad_norm": 1.1292070150375366, + "kl": 0.23137228190898895, + "learning_rate": 4.999785818935018e-06, + "loss": 0.0093, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 415 + }, + { + "completion_length": 142.25, + "epoch": 0.104, + "grad_norm": 0.8795533180236816, + "kl": 0.10361534357070923, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0041, + "reward": 1.3125, + "reward_std": 0.873723566532135, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.75, + "step": 416 + }, + { + "completion_length": 116.0, + "epoch": 0.10425, + "grad_norm": 0.8660876750946045, + "kl": 0.06504642218351364, + "learning_rate": 4.999724897438332e-06, + "loss": 0.0026, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 417 + }, + { + "completion_length": 76.125, + "epoch": 0.1045, + "grad_norm": 0.8043219447135925, + "kl": 0.11721104383468628, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0047, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 418 + }, + { + "completion_length": 134.5, + "epoch": 0.10475, + "grad_norm": 0.8124399185180664, + "kl": 0.1422225832939148, + "learning_rate": 4.999656361346094e-06, + "loss": 0.0057, + "reward": 1.2687499523162842, + "reward_std": 0.87257319688797, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.75, + "step": 419 + }, + { + "completion_length": 113.125, + "epoch": 0.105, + "grad_norm": 0.2114391177892685, + "kl": 0.1519714593887329, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0061, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 420 + }, + { + "completion_length": 89.25, + "epoch": 0.10525, + "grad_norm": 0.5262160301208496, + "kl": 0.061721399426460266, + "learning_rate": 4.9995802108670775e-06, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 421 + }, + { + "completion_length": 101.125, + "epoch": 0.1055, + "grad_norm": 0.956619918346405, + "kl": 0.07579920440912247, + "learning_rate": 4.999539280304111e-06, + "loss": 0.003, + "reward": 1.756250023841858, + "reward_std": 0.45153507590293884, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 422 + }, + { + "completion_length": 110.875, + "epoch": 0.10575, + "grad_norm": 1.250651478767395, + "kl": 0.1732567399740219, + "learning_rate": 4.999496446233249e-06, + "loss": 0.0069, + "reward": 1.568750023841858, + "reward_std": 0.4689939618110657, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 0.875, + "step": 423 + }, + { + "completion_length": 121.5, + "epoch": 0.106, + "grad_norm": 0.8256289958953857, + "kl": 0.1623517870903015, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0065, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 424 + }, + { + "completion_length": 83.125, + "epoch": 0.10625, + "grad_norm": 0.12795695662498474, + "kl": 0.08748316764831543, + "learning_rate": 4.999405067699773e-06, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 425 + }, + { + "completion_length": 69.5, + "epoch": 0.1065, + "grad_norm": 1.4006377458572388, + "kl": 0.09922429174184799, + "learning_rate": 4.999356523306746e-06, + "loss": 0.004, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 1.0, + "step": 426 + }, + { + "completion_length": 68.375, + "epoch": 0.10675, + "grad_norm": 1.326810598373413, + "kl": 0.10130297392606735, + "learning_rate": 4.999306075545002e-06, + "loss": 0.0041, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 427 + }, + { + "completion_length": 106.75, + "epoch": 0.107, + "grad_norm": 1.5847952365875244, + "kl": 0.14548355340957642, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0058, + "reward": 1.53125, + "reward_std": 0.8705242872238159, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 428 + }, + { + "completion_length": 76.75, + "epoch": 0.10725, + "grad_norm": 1.0535387992858887, + "kl": 0.08334135264158249, + "learning_rate": 4.999199470070484e-06, + "loss": 0.0033, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 429 + }, + { + "completion_length": 105.75, + "epoch": 0.1075, + "grad_norm": 0.04739993438124657, + "kl": 0.05667338892817497, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 430 + }, + { + "completion_length": 97.375, + "epoch": 0.10775, + "grad_norm": 0.050007615238428116, + "kl": 0.09176231920719147, + "learning_rate": 4.9990852516009556e-06, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 431 + }, + { + "completion_length": 68.5, + "epoch": 0.108, + "grad_norm": 1.226663589477539, + "kl": 0.08610677719116211, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0034, + "reward": 1.318750023841858, + "reward_std": 0.2840240001678467, + "rewards/_accuracy_reward": 0.3187499940395355, + "rewards/_format_reward": 1.0, + "step": 432 + }, + { + "completion_length": 135.125, + "epoch": 0.10825, + "grad_norm": 0.9774854183197021, + "kl": 0.060582954436540604, + "learning_rate": 4.998963420484349e-06, + "loss": 0.0024, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 433 + }, + { + "completion_length": 108.25, + "epoch": 0.1085, + "grad_norm": 0.0652063712477684, + "kl": 0.0928327664732933, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 434 + }, + { + "completion_length": 116.5, + "epoch": 0.10875, + "grad_norm": 0.066441610455513, + "kl": 0.08606898039579391, + "learning_rate": 4.998833977091783e-06, + "loss": 0.0034, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 435 + }, + { + "completion_length": 76.25, + "epoch": 0.109, + "grad_norm": 0.06395157426595688, + "kl": 0.08062107861042023, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 436 + }, + { + "completion_length": 112.375, + "epoch": 0.10925, + "grad_norm": 0.9348214864730835, + "kl": 0.11126627773046494, + "learning_rate": 4.998696921817562e-06, + "loss": 0.0045, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 437 + }, + { + "completion_length": 98.5, + "epoch": 0.1095, + "grad_norm": 0.9661799669265747, + "kl": 0.06792579591274261, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0027, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 438 + }, + { + "completion_length": 121.625, + "epoch": 0.10975, + "grad_norm": 0.9441984295845032, + "kl": 0.06458202749490738, + "learning_rate": 4.998552255079182e-06, + "loss": 0.0026, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 1.0, + "step": 439 + }, + { + "completion_length": 135.375, + "epoch": 0.11, + "grad_norm": 1.015769124031067, + "kl": 0.05865727737545967, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0023, + "reward": 1.5625, + "reward_std": 0.47715675830841064, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.75, + "step": 440 + }, + { + "completion_length": 156.25, + "epoch": 0.11025, + "grad_norm": 0.6271827220916748, + "kl": 0.08216940611600876, + "learning_rate": 4.998399977317323e-06, + "loss": 0.0033, + "reward": 1.5012500286102295, + "reward_std": 0.42089828848838806, + "rewards/_accuracy_reward": 0.5012500286102295, + "rewards/_format_reward": 1.0, + "step": 441 + }, + { + "completion_length": 143.5, + "epoch": 0.1105, + "grad_norm": 0.7138703465461731, + "kl": 0.09817658364772797, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0039, + "reward": 1.7512500286102295, + "reward_std": 0.7035712003707886, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 0.875, + "step": 442 + }, + { + "completion_length": 126.75, + "epoch": 0.11075, + "grad_norm": 0.5848399996757507, + "kl": 0.06327465921640396, + "learning_rate": 4.9982400889958494e-06, + "loss": 0.0025, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 443 + }, + { + "completion_length": 126.75, + "epoch": 0.111, + "grad_norm": 1.716501235961914, + "kl": 0.430374413728714, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0172, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.625, + "step": 444 + }, + { + "completion_length": 97.375, + "epoch": 0.11125, + "grad_norm": 1.0113590955734253, + "kl": 0.08199731260538101, + "learning_rate": 4.998072590601808e-06, + "loss": 0.0033, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 445 + }, + { + "completion_length": 163.5, + "epoch": 0.1115, + "grad_norm": 0.5446489453315735, + "kl": 0.08332154899835587, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.0033, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 446 + }, + { + "completion_length": 105.25, + "epoch": 0.11175, + "grad_norm": 0.846171498298645, + "kl": 0.08622215688228607, + "learning_rate": 4.997897482645428e-06, + "loss": 0.0034, + "reward": 1.4249999523162842, + "reward_std": 0.481812059879303, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 0.875, + "step": 447 + }, + { + "completion_length": 116.25, + "epoch": 0.112, + "grad_norm": 0.7110444903373718, + "kl": 0.05666430667042732, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0023, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 448 + }, + { + "completion_length": 92.125, + "epoch": 0.11225, + "grad_norm": 1.340246558189392, + "kl": 0.09173227101564407, + "learning_rate": 4.9977147656601196e-06, + "loss": 0.0037, + "reward": 1.5187499523162842, + "reward_std": 0.514738142490387, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 0.875, + "step": 449 + }, + { + "completion_length": 92.25, + "epoch": 0.1125, + "grad_norm": 1.8258260488510132, + "kl": 0.14632035791873932, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0059, + "reward": 1.475000023841858, + "reward_std": 0.6419835090637207, + "rewards/_accuracy_reward": 0.6000000238418579, + "rewards/_format_reward": 0.875, + "step": 450 + }, + { + "completion_length": 172.625, + "epoch": 0.11275, + "grad_norm": 0.7036133408546448, + "kl": 0.07693489640951157, + "learning_rate": 4.997524440202469e-06, + "loss": 0.0031, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.625, + "step": 451 + }, + { + "completion_length": 168.0, + "epoch": 0.113, + "grad_norm": 0.6177405118942261, + "kl": 0.08349208533763885, + "learning_rate": 4.997426424476787e-06, + "loss": 0.0033, + "reward": 1.2949999570846558, + "reward_std": 0.6939947009086609, + "rewards/_accuracy_reward": 0.41999998688697815, + "rewards/_format_reward": 0.875, + "step": 452 + }, + { + "completion_length": 161.25, + "epoch": 0.11325, + "grad_norm": 0.7319234013557434, + "kl": 0.1011638194322586, + "learning_rate": 4.997326506852242e-06, + "loss": 0.004, + "reward": 1.28125, + "reward_std": 0.9949650168418884, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.625, + "step": 453 + }, + { + "completion_length": 143.25, + "epoch": 0.1135, + "grad_norm": 0.7980057597160339, + "kl": 0.08353295922279358, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0033, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 454 + }, + { + "completion_length": 138.125, + "epoch": 0.11375, + "grad_norm": 0.7856406569480896, + "kl": 0.09998480975627899, + "learning_rate": 4.9971209662123774e-06, + "loss": 0.004, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 455 + }, + { + "completion_length": 143.75, + "epoch": 0.114, + "grad_norm": 0.48554107546806335, + "kl": 0.05186166986823082, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.0021, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 456 + }, + { + "completion_length": 79.625, + "epoch": 0.11425, + "grad_norm": 0.03818913921713829, + "kl": 0.07487285137176514, + "learning_rate": 4.996907818908987e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 457 + }, + { + "completion_length": 117.75, + "epoch": 0.1145, + "grad_norm": 0.8235962390899658, + "kl": 0.0846768468618393, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0034, + "reward": 1.53125, + "reward_std": 0.8705242872238159, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 458 + }, + { + "completion_length": 204.0, + "epoch": 0.11475, + "grad_norm": 0.5379483103752136, + "kl": 0.09781916439533234, + "learning_rate": 4.996687065591355e-06, + "loss": 0.0039, + "reward": 1.2937500476837158, + "reward_std": 0.6945900321006775, + "rewards/_accuracy_reward": 0.543749988079071, + "rewards/_format_reward": 0.75, + "step": 459 + }, + { + "completion_length": 136.75, + "epoch": 0.115, + "grad_norm": 0.7759735584259033, + "kl": 0.0745742917060852, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.003, + "reward": 1.568750023841858, + "reward_std": 0.6589697003364563, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 0.875, + "step": 460 + }, + { + "completion_length": 87.0, + "epoch": 0.11525, + "grad_norm": 1.2910523414611816, + "kl": 0.07611493766307831, + "learning_rate": 4.996458706931935e-06, + "loss": 0.003, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 461 + }, + { + "completion_length": 133.375, + "epoch": 0.1155, + "grad_norm": 0.7070830464363098, + "kl": 0.10267248749732971, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0041, + "reward": 1.6875, + "reward_std": 0.6373774409294128, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 462 + }, + { + "completion_length": 107.625, + "epoch": 0.11575, + "grad_norm": 0.16717670857906342, + "kl": 0.13669300079345703, + "learning_rate": 4.996222743626346e-06, + "loss": 0.0055, + "reward": 1.25, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 1.0, + "step": 463 + }, + { + "completion_length": 74.125, + "epoch": 0.116, + "grad_norm": 1.4139927625656128, + "kl": 0.1574665755033493, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0063, + "reward": 1.34375, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.34375, + "rewards/_format_reward": 1.0, + "step": 464 + }, + { + "completion_length": 151.375, + "epoch": 0.11625, + "grad_norm": 0.7854687571525574, + "kl": 0.10802503675222397, + "learning_rate": 4.995979176393372e-06, + "loss": 0.0043, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 465 + }, + { + "completion_length": 136.75, + "epoch": 0.1165, + "grad_norm": 0.6209039688110352, + "kl": 0.1174168810248375, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0047, + "reward": 1.568750023841858, + "reward_std": 0.7009878158569336, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 0.875, + "step": 466 + }, + { + "completion_length": 147.25, + "epoch": 0.11675, + "grad_norm": 0.5902235507965088, + "kl": 0.10785003006458282, + "learning_rate": 4.995728005974964e-06, + "loss": 0.0043, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.2812499701976776, + "rewards/_format_reward": 0.875, + "step": 467 + }, + { + "completion_length": 133.375, + "epoch": 0.117, + "grad_norm": 0.8462874889373779, + "kl": 0.2404460906982422, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0096, + "reward": 1.274999976158142, + "reward_std": 0.6974443197250366, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.875, + "step": 468 + }, + { + "completion_length": 131.125, + "epoch": 0.11725, + "grad_norm": 0.5348749756813049, + "kl": 0.0859207808971405, + "learning_rate": 4.9954692331362295e-06, + "loss": 0.0034, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 469 + }, + { + "completion_length": 141.25, + "epoch": 0.1175, + "grad_norm": 0.6675086617469788, + "kl": 0.21905027329921722, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0088, + "reward": 1.78125, + "reward_std": 0.6187184453010559, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 470 + }, + { + "completion_length": 135.25, + "epoch": 0.11775, + "grad_norm": 0.5653254985809326, + "kl": 0.1933996081352234, + "learning_rate": 4.995202858665434e-06, + "loss": 0.0077, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 471 + }, + { + "completion_length": 155.875, + "epoch": 0.118, + "grad_norm": 0.1728929877281189, + "kl": 0.16971825063228607, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0068, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 472 + }, + { + "completion_length": 130.5, + "epoch": 0.11825, + "grad_norm": 0.8266690969467163, + "kl": 0.0714937224984169, + "learning_rate": 4.9949288833740016e-06, + "loss": 0.0029, + "reward": 1.3312499523162842, + "reward_std": 0.671319305896759, + "rewards/_accuracy_reward": 0.45624998211860657, + "rewards/_format_reward": 0.875, + "step": 473 + }, + { + "completion_length": 111.125, + "epoch": 0.1185, + "grad_norm": 0.8442752957344055, + "kl": 0.10157648473978043, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0041, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 474 + }, + { + "completion_length": 121.0, + "epoch": 0.11875, + "grad_norm": 0.6787703037261963, + "kl": 0.0763428583741188, + "learning_rate": 4.994647308096509e-06, + "loss": 0.0031, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 475 + }, + { + "completion_length": 163.375, + "epoch": 0.119, + "grad_norm": 0.028817906975746155, + "kl": 0.04729737713932991, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 476 + }, + { + "completion_length": 121.5, + "epoch": 0.11925, + "grad_norm": 0.6946129202842712, + "kl": 0.20381557941436768, + "learning_rate": 4.994358133690683e-06, + "loss": 0.0082, + "reward": 1.3250000476837158, + "reward_std": 0.6850443482398987, + "rewards/_accuracy_reward": 0.44999998807907104, + "rewards/_format_reward": 0.875, + "step": 477 + }, + { + "completion_length": 115.5, + "epoch": 0.1195, + "grad_norm": 16.712684631347656, + "kl": 0.10329899191856384, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0041, + "reward": 1.443750023841858, + "reward_std": 0.7123590111732483, + "rewards/_accuracy_reward": 0.5687500238418579, + "rewards/_format_reward": 0.875, + "step": 478 + }, + { + "completion_length": 112.375, + "epoch": 0.11975, + "grad_norm": 0.08893012255430222, + "kl": 0.07471595704555511, + "learning_rate": 4.9940613610373974e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 479 + }, + { + "completion_length": 161.625, + "epoch": 0.12, + "grad_norm": 0.5868445038795471, + "kl": 0.08950529247522354, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0036, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 480 + }, + { + "completion_length": 141.625, + "epoch": 0.12025, + "grad_norm": 0.5067136883735657, + "kl": 0.11062958091497421, + "learning_rate": 4.993756991040676e-06, + "loss": 0.0044, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 481 + }, + { + "completion_length": 114.125, + "epoch": 0.1205, + "grad_norm": 0.1355297863483429, + "kl": 0.15943318605422974, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0064, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 482 + }, + { + "completion_length": 163.0, + "epoch": 0.12075, + "grad_norm": 1.0221439599990845, + "kl": 0.14336055517196655, + "learning_rate": 4.99344502462768e-06, + "loss": 0.0057, + "reward": 1.131250023841858, + "reward_std": 0.9902876615524292, + "rewards/_accuracy_reward": 0.5062500238418579, + "rewards/_format_reward": 0.625, + "step": 483 + }, + { + "completion_length": 182.375, + "epoch": 0.121, + "grad_norm": 0.9383127689361572, + "kl": 0.12501807510852814, + "learning_rate": 4.993286193061145e-06, + "loss": 0.005, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.625, + "step": 484 + }, + { + "completion_length": 150.25, + "epoch": 0.12125, + "grad_norm": 1.1651649475097656, + "kl": 0.10007007420063019, + "learning_rate": 4.993125462748714e-06, + "loss": 0.004, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 485 + }, + { + "completion_length": 108.0, + "epoch": 0.1215, + "grad_norm": 0.028756048530340195, + "kl": 0.06177728250622749, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 486 + }, + { + "completion_length": 85.0, + "epoch": 0.12175, + "grad_norm": 0.8098493814468384, + "kl": 0.1391671895980835, + "learning_rate": 4.9927983063772205e-06, + "loss": 0.0056, + "reward": 1.318750023841858, + "reward_std": 0.28402402997016907, + "rewards/_accuracy_reward": 0.3187499940395355, + "rewards/_format_reward": 1.0, + "step": 487 + }, + { + "completion_length": 120.125, + "epoch": 0.122, + "grad_norm": 1.6718604564666748, + "kl": 0.07330606877803802, + "learning_rate": 4.992631880567301e-06, + "loss": 0.0029, + "reward": 1.537500023841858, + "reward_std": 0.7322909235954285, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.875, + "step": 488 + }, + { + "completion_length": 136.375, + "epoch": 0.12225, + "grad_norm": 0.7673369646072388, + "kl": 0.10325151681900024, + "learning_rate": 4.992463556509772e-06, + "loss": 0.0041, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 489 + }, + { + "completion_length": 142.375, + "epoch": 0.1225, + "grad_norm": 1.1908446550369263, + "kl": 0.1146053820848465, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0046, + "reward": 1.3875000476837158, + "reward_std": 0.3879893124103546, + "rewards/_accuracy_reward": 0.38749998807907104, + "rewards/_format_reward": 1.0, + "step": 490 + }, + { + "completion_length": 78.625, + "epoch": 0.12275, + "grad_norm": 1.0218267440795898, + "kl": 0.20096728205680847, + "learning_rate": 4.992121214166077e-06, + "loss": 0.008, + "reward": 1.5499999523162842, + "reward_std": 0.4855042099952698, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 491 + }, + { + "completion_length": 115.125, + "epoch": 0.123, + "grad_norm": 0.7811625003814697, + "kl": 0.07716540992259979, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0031, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 492 + }, + { + "completion_length": 176.375, + "epoch": 0.12325, + "grad_norm": 0.835605800151825, + "kl": 0.18739749491214752, + "learning_rate": 4.991771280388967e-06, + "loss": 0.0075, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.875, + "step": 493 + }, + { + "completion_length": 117.625, + "epoch": 0.1235, + "grad_norm": 0.06079603359103203, + "kl": 0.06408681720495224, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0026, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 494 + }, + { + "completion_length": 96.5, + "epoch": 0.12375, + "grad_norm": 0.0451425276696682, + "kl": 0.05102433264255524, + "learning_rate": 4.991413756244404e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 495 + }, + { + "completion_length": 109.5, + "epoch": 0.124, + "grad_norm": 0.8194900751113892, + "kl": 0.12500625848770142, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.005, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 496 + }, + { + "completion_length": 106.875, + "epoch": 0.12425, + "grad_norm": 0.21397340297698975, + "kl": 0.1683153510093689, + "learning_rate": 4.991048642821466e-06, + "loss": 0.0067, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 497 + }, + { + "completion_length": 122.375, + "epoch": 0.1245, + "grad_norm": 1.4960228204727173, + "kl": 0.11880878359079361, + "learning_rate": 4.990863240477266e-06, + "loss": 0.0048, + "reward": 1.5187499523162842, + "reward_std": 0.514738142490387, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 0.875, + "step": 498 + }, + { + "completion_length": 150.125, + "epoch": 0.12475, + "grad_norm": 0.8118491768836975, + "kl": 0.15849874913692474, + "learning_rate": 4.990675941232353e-06, + "loss": 0.0063, + "reward": 0.9437499046325684, + "reward_std": 0.38770151138305664, + "rewards/_accuracy_reward": 0.06875000149011612, + "rewards/_format_reward": 0.875, + "step": 499 + }, + { + "completion_length": 136.75, + "epoch": 0.125, + "grad_norm": 0.6899111866950989, + "kl": 0.1458391398191452, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0058, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 500 + }, + { + "completion_length": 160.75, + "epoch": 0.12525, + "grad_norm": 0.6339772939682007, + "kl": 0.16045698523521423, + "learning_rate": 4.990295652612379e-06, + "loss": 0.0064, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 501 + }, + { + "completion_length": 113.375, + "epoch": 0.1255, + "grad_norm": 0.7603309154510498, + "kl": 0.10802485048770905, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0043, + "reward": 1.21875, + "reward_std": 0.4712729752063751, + "rewards/_accuracy_reward": 0.34375, + "rewards/_format_reward": 0.875, + "step": 502 + }, + { + "completion_length": 121.0, + "epoch": 0.12575, + "grad_norm": 0.5087311267852783, + "kl": 0.08297999203205109, + "learning_rate": 4.989907778119969e-06, + "loss": 0.0033, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 503 + }, + { + "completion_length": 164.375, + "epoch": 0.126, + "grad_norm": 0.7062379121780396, + "kl": 0.16635379195213318, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0067, + "reward": 1.40625, + "reward_std": 0.7311622500419617, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.625, + "step": 504 + }, + { + "completion_length": 126.25, + "epoch": 0.12625, + "grad_norm": 0.9284823536872864, + "kl": 0.07494886219501495, + "learning_rate": 4.989512318936654e-06, + "loss": 0.003, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 505 + }, + { + "completion_length": 161.375, + "epoch": 0.1265, + "grad_norm": 0.6040777564048767, + "kl": 0.14252394437789917, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0057, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 506 + }, + { + "completion_length": 130.25, + "epoch": 0.12675, + "grad_norm": 0.06497879326343536, + "kl": 0.06290639191865921, + "learning_rate": 4.989109276267074e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 507 + }, + { + "completion_length": 135.875, + "epoch": 0.127, + "grad_norm": 0.672325074672699, + "kl": 0.05711538344621658, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0023, + "reward": 1.46875, + "reward_std": 0.6999680995941162, + "rewards/_accuracy_reward": 0.59375, + "rewards/_format_reward": 0.875, + "step": 508 + }, + { + "completion_length": 184.875, + "epoch": 0.12725, + "grad_norm": 0.994357705116272, + "kl": 0.13488022983074188, + "learning_rate": 4.988698651338965e-06, + "loss": 0.0054, + "reward": 1.125, + "reward_std": 0.9910312294960022, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.5, + "step": 509 + }, + { + "completion_length": 155.75, + "epoch": 0.1275, + "grad_norm": 0.030594639480113983, + "kl": 0.06842894107103348, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 510 + }, + { + "completion_length": 163.25, + "epoch": 0.12775, + "grad_norm": 0.887022852897644, + "kl": 0.19011199474334717, + "learning_rate": 4.988280445403164e-06, + "loss": 0.0076, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 511 + }, + { + "completion_length": 150.5, + "epoch": 0.128, + "grad_norm": 0.8398552536964417, + "kl": 0.154772087931633, + "learning_rate": 4.988068499954578e-06, + "loss": 0.0062, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 512 + }, + { + "completion_length": 143.0, + "epoch": 0.12825, + "grad_norm": 0.8649221062660217, + "kl": 0.07180348038673401, + "learning_rate": 4.987854659733597e-06, + "loss": 0.0029, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 513 + }, + { + "completion_length": 183.75, + "epoch": 0.1285, + "grad_norm": 0.36669495701789856, + "kl": 0.0827406495809555, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0033, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 514 + }, + { + "completion_length": 133.875, + "epoch": 0.12875, + "grad_norm": 0.7541155219078064, + "kl": 0.09912529587745667, + "learning_rate": 4.987421295627279e-06, + "loss": 0.004, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 515 + }, + { + "completion_length": 126.125, + "epoch": 0.129, + "grad_norm": 1.4548908472061157, + "kl": 0.16541288793087006, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0066, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 516 + }, + { + "completion_length": 124.375, + "epoch": 0.12925, + "grad_norm": 0.8262905478477478, + "kl": 0.21291877329349518, + "learning_rate": 4.986980354404316e-06, + "loss": 0.0085, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.2812499701976776, + "rewards/_format_reward": 0.875, + "step": 517 + }, + { + "completion_length": 94.625, + "epoch": 0.1295, + "grad_norm": 1.0423915386199951, + "kl": 0.10086517781019211, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.004, + "reward": 1.693750023841858, + "reward_std": 0.42714792490005493, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 518 + }, + { + "completion_length": 137.125, + "epoch": 0.12975, + "grad_norm": 0.5806828141212463, + "kl": 0.1275208294391632, + "learning_rate": 4.986531837407891e-06, + "loss": 0.0051, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 519 + }, + { + "completion_length": 115.5, + "epoch": 0.13, + "grad_norm": 1.0486258268356323, + "kl": 0.2358919382095337, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0094, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.875, + "step": 520 + }, + { + "completion_length": 102.75, + "epoch": 0.13025, + "grad_norm": 0.8502905368804932, + "kl": 0.11098451167345047, + "learning_rate": 4.986075746004262e-06, + "loss": 0.0044, + "reward": 1.53125, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 1.0, + "step": 521 + }, + { + "completion_length": 123.0, + "epoch": 0.1305, + "grad_norm": 0.04550248757004738, + "kl": 0.10682176798582077, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0043, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 522 + }, + { + "completion_length": 158.0, + "epoch": 0.13075, + "grad_norm": 0.6766717433929443, + "kl": 0.04881787300109863, + "learning_rate": 4.985612081582763e-06, + "loss": 0.002, + "reward": 0.9187499284744263, + "reward_std": 0.3712310194969177, + "rewards/_accuracy_reward": 0.04375000298023224, + "rewards/_format_reward": 0.875, + "step": 523 + }, + { + "completion_length": 189.25, + "epoch": 0.131, + "grad_norm": 0.7428447604179382, + "kl": 0.1956254243850708, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 524 + }, + { + "completion_length": 97.25, + "epoch": 0.13125, + "grad_norm": 0.7122312784194946, + "kl": 0.08156180381774902, + "learning_rate": 4.985140845555799e-06, + "loss": 0.0033, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 525 + }, + { + "completion_length": 187.75, + "epoch": 0.1315, + "grad_norm": 0.7845567464828491, + "kl": 0.1910717934370041, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0076, + "reward": 1.0325000286102295, + "reward_std": 0.9299885034561157, + "rewards/_accuracy_reward": 0.4074999988079071, + "rewards/_format_reward": 0.625, + "step": 526 + }, + { + "completion_length": 183.0, + "epoch": 0.13175, + "grad_norm": 0.7677600979804993, + "kl": 0.17733940482139587, + "learning_rate": 4.984662039358835e-06, + "loss": 0.0071, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 527 + }, + { + "completion_length": 142.375, + "epoch": 0.132, + "grad_norm": 0.6115749478340149, + "kl": 0.21568211913108826, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0086, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 528 + }, + { + "completion_length": 104.75, + "epoch": 0.13225, + "grad_norm": 0.6236662268638611, + "kl": 0.09865246713161469, + "learning_rate": 4.9841756644503965e-06, + "loss": 0.0039, + "reward": 1.0374999046325684, + "reward_std": 0.5350233316421509, + "rewards/_accuracy_reward": 0.16250000894069672, + "rewards/_format_reward": 0.875, + "step": 529 + }, + { + "completion_length": 85.75, + "epoch": 0.1325, + "grad_norm": 1.077025294303894, + "kl": 0.11589670181274414, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0046, + "reward": 1.190000057220459, + "reward_std": 0.11109840869903564, + "rewards/_accuracy_reward": 0.1899999976158142, + "rewards/_format_reward": 1.0, + "step": 530 + }, + { + "completion_length": 125.125, + "epoch": 0.13275, + "grad_norm": 0.9445363283157349, + "kl": 0.12175693362951279, + "learning_rate": 4.983681722312068e-06, + "loss": 0.0049, + "reward": 1.3250000476837158, + "reward_std": 0.6850443482398987, + "rewards/_accuracy_reward": 0.44999998807907104, + "rewards/_format_reward": 0.875, + "step": 531 + }, + { + "completion_length": 148.875, + "epoch": 0.133, + "grad_norm": 0.7169702053070068, + "kl": 0.0948939323425293, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0038, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 532 + }, + { + "completion_length": 131.625, + "epoch": 0.13325, + "grad_norm": 0.7590733766555786, + "kl": 0.08492686599493027, + "learning_rate": 4.983180214448481e-06, + "loss": 0.0034, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 533 + }, + { + "completion_length": 120.375, + "epoch": 0.1335, + "grad_norm": 0.895210325717926, + "kl": 0.12446754425764084, + "learning_rate": 4.982926623846216e-06, + "loss": 0.005, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 534 + }, + { + "completion_length": 85.25, + "epoch": 0.13375, + "grad_norm": 1.56959068775177, + "kl": 0.09629300981760025, + "learning_rate": 4.982671142387316e-06, + "loss": 0.0039, + "reward": 1.34375, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.34375, + "rewards/_format_reward": 1.0, + "step": 535 + }, + { + "completion_length": 105.75, + "epoch": 0.134, + "grad_norm": 1.4607653617858887, + "kl": 0.20348867774009705, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0081, + "reward": 1.5950000286102295, + "reward_std": 0.4400324523448944, + "rewards/_accuracy_reward": 0.5950000286102295, + "rewards/_format_reward": 1.0, + "step": 536 + }, + { + "completion_length": 100.375, + "epoch": 0.13425, + "grad_norm": 0.8848427534103394, + "kl": 0.21129947900772095, + "learning_rate": 4.982154507679296e-06, + "loss": 0.0085, + "reward": 1.2062499523162842, + "reward_std": 0.6315725445747375, + "rewards/_accuracy_reward": 0.33125001192092896, + "rewards/_format_reward": 0.875, + "step": 537 + }, + { + "completion_length": 166.875, + "epoch": 0.1345, + "grad_norm": 1.3390650749206543, + "kl": 0.2360527515411377, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0094, + "reward": 1.193750023841858, + "reward_std": 0.8304204940795898, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.75, + "step": 538 + }, + { + "completion_length": 165.5, + "epoch": 0.13475, + "grad_norm": 0.8016129732131958, + "kl": 0.16088564693927765, + "learning_rate": 4.981630311898178e-06, + "loss": 0.0064, + "reward": 1.149999976158142, + "reward_std": 0.6358346939086914, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.75, + "step": 539 + }, + { + "completion_length": 131.625, + "epoch": 0.135, + "grad_norm": 1.0546795129776, + "kl": 0.13393646478652954, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0054, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 540 + }, + { + "completion_length": 82.0, + "epoch": 0.13525, + "grad_norm": 0.7583869695663452, + "kl": 0.15737389028072357, + "learning_rate": 4.981098556640754e-06, + "loss": 0.0063, + "reward": 1.0749999284744263, + "reward_std": 0.0707106813788414, + "rewards/_accuracy_reward": 0.07500000298023224, + "rewards/_format_reward": 1.0, + "step": 541 + }, + { + "completion_length": 77.375, + "epoch": 0.1355, + "grad_norm": 1.210227370262146, + "kl": 0.14557315409183502, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0058, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 542 + }, + { + "completion_length": 117.125, + "epoch": 0.13575, + "grad_norm": 0.8977431654930115, + "kl": 0.10112278908491135, + "learning_rate": 4.980559243526844e-06, + "loss": 0.004, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 543 + }, + { + "completion_length": 110.375, + "epoch": 0.136, + "grad_norm": 0.9589873552322388, + "kl": 0.23000037670135498, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0092, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 544 + }, + { + "completion_length": 136.5, + "epoch": 0.13625, + "grad_norm": 1.4247018098831177, + "kl": 0.224748432636261, + "learning_rate": 4.980012374199288e-06, + "loss": 0.009, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 545 + }, + { + "completion_length": 109.0, + "epoch": 0.1365, + "grad_norm": 0.8289728760719299, + "kl": 0.1731492429971695, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0069, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 1.0, + "step": 546 + }, + { + "completion_length": 161.375, + "epoch": 0.13675, + "grad_norm": 0.7212539911270142, + "kl": 0.1648021787405014, + "learning_rate": 4.979457950323945e-06, + "loss": 0.0066, + "reward": 1.28125, + "reward_std": 0.44395744800567627, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 0.875, + "step": 547 + }, + { + "completion_length": 133.25, + "epoch": 0.137, + "grad_norm": 2.3475310802459717, + "kl": 0.3754054307937622, + "learning_rate": 4.979177905957726e-06, + "loss": 0.015, + "reward": 0.78125, + "reward_std": 0.7100993394851685, + "rewards/_accuracy_reward": 0.1562499850988388, + "rewards/_format_reward": 0.625, + "step": 548 + }, + { + "completion_length": 162.25, + "epoch": 0.13725, + "grad_norm": 0.7575663328170776, + "kl": 0.1953345090150833, + "learning_rate": 4.978895973589686e-06, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 549 + }, + { + "completion_length": 175.0, + "epoch": 0.1375, + "grad_norm": 2.3825697898864746, + "kl": 0.20287609100341797, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0081, + "reward": 1.1687500476837158, + "reward_std": 0.8314949870109558, + "rewards/_accuracy_reward": 0.41874998807907104, + "rewards/_format_reward": 0.75, + "step": 550 + }, + { + "completion_length": 82.75, + "epoch": 0.13775, + "grad_norm": 1.088090419769287, + "kl": 0.24210913479328156, + "learning_rate": 4.97832644570839e-06, + "loss": 0.0097, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 551 + }, + { + "completion_length": 182.625, + "epoch": 0.138, + "grad_norm": 0.5995256304740906, + "kl": 0.1471828818321228, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0059, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 552 + }, + { + "completion_length": 112.5, + "epoch": 0.13825, + "grad_norm": 0.835372805595398, + "kl": 0.12362098693847656, + "learning_rate": 4.977749368414938e-06, + "loss": 0.0049, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 553 + }, + { + "completion_length": 102.625, + "epoch": 0.1385, + "grad_norm": 0.08654724806547165, + "kl": 0.16423742473125458, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0066, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 554 + }, + { + "completion_length": 175.125, + "epoch": 0.13875, + "grad_norm": 0.7749510407447815, + "kl": 0.2203240692615509, + "learning_rate": 4.977164743467206e-06, + "loss": 0.0088, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.625, + "step": 555 + }, + { + "completion_length": 117.625, + "epoch": 0.139, + "grad_norm": 1.410696268081665, + "kl": 0.2296142429113388, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.0092, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 556 + }, + { + "completion_length": 157.625, + "epoch": 0.13925, + "grad_norm": 0.8265271186828613, + "kl": 0.2630419433116913, + "learning_rate": 4.976572572646064e-06, + "loss": 0.0105, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 557 + }, + { + "completion_length": 122.875, + "epoch": 0.1395, + "grad_norm": 0.7778673768043518, + "kl": 0.22738924622535706, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0091, + "reward": 1.506250023841858, + "reward_std": 0.7513975501060486, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.75, + "step": 558 + }, + { + "completion_length": 88.5, + "epoch": 0.13975, + "grad_norm": 0.042239658534526825, + "kl": 0.07109732180833817, + "learning_rate": 4.975972857755369e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 559 + }, + { + "completion_length": 87.125, + "epoch": 0.14, + "grad_norm": 1.4101208448410034, + "kl": 0.22331476211547852, + "learning_rate": 4.975670171853926e-06, + "loss": 0.0089, + "reward": 1.78125, + "reward_std": 0.6187184453010559, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 560 + }, + { + "completion_length": 88.625, + "epoch": 0.14025, + "grad_norm": 0.799018383026123, + "kl": 0.09630677103996277, + "learning_rate": 4.975365600621953e-06, + "loss": 0.0039, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 561 + }, + { + "completion_length": 185.0, + "epoch": 0.1405, + "grad_norm": 0.6921383738517761, + "kl": 0.14386911690235138, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0058, + "reward": 1.1875, + "reward_std": 0.5724321603775024, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 0.875, + "step": 562 + }, + { + "completion_length": 141.375, + "epoch": 0.14075, + "grad_norm": 0.7761791944503784, + "kl": 0.1153935045003891, + "learning_rate": 4.974750803095629e-06, + "loss": 0.0046, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 563 + }, + { + "completion_length": 120.25, + "epoch": 0.141, + "grad_norm": 0.2582365572452545, + "kl": 0.17987042665481567, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0072, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 564 + }, + { + "completion_length": 142.0, + "epoch": 0.14125, + "grad_norm": 1.3962867259979248, + "kl": 0.17358022928237915, + "learning_rate": 4.974128467049177e-06, + "loss": 0.0069, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 565 + }, + { + "completion_length": 140.5, + "epoch": 0.1415, + "grad_norm": 0.7365003824234009, + "kl": 0.09812232851982117, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0039, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 566 + }, + { + "completion_length": 137.375, + "epoch": 0.14175, + "grad_norm": 0.8879725337028503, + "kl": 0.2860892415046692, + "learning_rate": 4.973498594378338e-06, + "loss": 0.0114, + "reward": 1.399999976158142, + "reward_std": 0.4971346855163574, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 0.875, + "step": 567 + }, + { + "completion_length": 99.125, + "epoch": 0.142, + "grad_norm": 0.7891566157341003, + "kl": 0.12184718996286392, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0049, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 568 + }, + { + "completion_length": 81.25, + "epoch": 0.14225, + "grad_norm": 0.03716867417097092, + "kl": 0.054770611226558685, + "learning_rate": 4.972861187001815e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 569 + }, + { + "completion_length": 128.125, + "epoch": 0.1425, + "grad_norm": 1.0409587621688843, + "kl": 0.14994631707668304, + "learning_rate": 4.972539658404793e-06, + "loss": 0.006, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 570 + }, + { + "completion_length": 149.125, + "epoch": 0.14275, + "grad_norm": 0.9049003720283508, + "kl": 0.0754564180970192, + "learning_rate": 4.9722162468612625e-06, + "loss": 0.003, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 571 + }, + { + "completion_length": 171.25, + "epoch": 0.143, + "grad_norm": 0.8585514426231384, + "kl": 0.0837872251868248, + "learning_rate": 4.971890952617515e-06, + "loss": 0.0034, + "reward": 1.3762500286102295, + "reward_std": 0.915547251701355, + "rewards/_accuracy_reward": 0.6262500286102295, + "rewards/_format_reward": 0.75, + "step": 572 + }, + { + "completion_length": 81.625, + "epoch": 0.14325, + "grad_norm": 1.177064061164856, + "kl": 0.10382416099309921, + "learning_rate": 4.9715637759212775e-06, + "loss": 0.0042, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 573 + }, + { + "completion_length": 143.875, + "epoch": 0.1435, + "grad_norm": 1.1178721189498901, + "kl": 0.17203757166862488, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0069, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 574 + }, + { + "completion_length": 149.625, + "epoch": 0.14375, + "grad_norm": 0.05728991702198982, + "kl": 0.07802347093820572, + "learning_rate": 4.970903776169403e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 575 + }, + { + "completion_length": 104.125, + "epoch": 0.144, + "grad_norm": 0.9672228097915649, + "kl": 0.12484028190374374, + "learning_rate": 4.970570953616383e-06, + "loss": 0.005, + "reward": 1.4562499523162842, + "reward_std": 0.45781898498535156, + "rewards/_accuracy_reward": 0.45624998211860657, + "rewards/_format_reward": 1.0, + "step": 576 + }, + { + "completion_length": 136.625, + "epoch": 0.14425, + "grad_norm": 1.2664356231689453, + "kl": 0.21360184252262115, + "learning_rate": 4.970236249616109e-06, + "loss": 0.0085, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 577 + }, + { + "completion_length": 144.75, + "epoch": 0.1445, + "grad_norm": 1.266003966331482, + "kl": 0.10478426516056061, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0042, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 578 + }, + { + "completion_length": 145.625, + "epoch": 0.14475, + "grad_norm": 0.06886684149503708, + "kl": 0.08477036654949188, + "learning_rate": 4.9695611982947995e-06, + "loss": 0.0034, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 579 + }, + { + "completion_length": 109.0, + "epoch": 0.145, + "grad_norm": 0.047261860221624374, + "kl": 0.08284247666597366, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 580 + }, + { + "completion_length": 134.875, + "epoch": 0.14525, + "grad_norm": 1.4606877565383911, + "kl": 0.4996589720249176, + "learning_rate": 4.968878624261798e-06, + "loss": 0.02, + "reward": 1.7512500286102295, + "reward_std": 0.460603266954422, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 0.875, + "step": 581 + }, + { + "completion_length": 89.0, + "epoch": 0.1455, + "grad_norm": 1.950114369392395, + "kl": 0.1971653401851654, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0079, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 582 + }, + { + "completion_length": 84.625, + "epoch": 0.14575, + "grad_norm": 2.8406033515930176, + "kl": 0.4174186885356903, + "learning_rate": 4.968188529596342e-06, + "loss": 0.0167, + "reward": 1.34375, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.34375, + "rewards/_format_reward": 1.0, + "step": 583 + }, + { + "completion_length": 147.75, + "epoch": 0.146, + "grad_norm": 0.6163609623908997, + "kl": 0.10596763342618942, + "learning_rate": 4.96784066268247e-06, + "loss": 0.0042, + "reward": 1.1937499046325684, + "reward_std": 0.33320683240890503, + "rewards/_accuracy_reward": 0.19374999403953552, + "rewards/_format_reward": 1.0, + "step": 584 + }, + { + "completion_length": 151.875, + "epoch": 0.14625, + "grad_norm": 0.24944590032100677, + "kl": 0.09916547685861588, + "learning_rate": 4.9674909164005805e-06, + "loss": 0.004, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 585 + }, + { + "completion_length": 97.25, + "epoch": 0.1465, + "grad_norm": 0.6661841869354248, + "kl": 0.05971316248178482, + "learning_rate": 4.967139291017018e-06, + "loss": 0.0024, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 586 + }, + { + "completion_length": 177.75, + "epoch": 0.14675, + "grad_norm": 1.3881531953811646, + "kl": 0.18598943948745728, + "learning_rate": 4.966785786799564e-06, + "loss": 0.0074, + "reward": 1.6375000476837158, + "reward_std": 0.5005354285240173, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 0.875, + "step": 587 + }, + { + "completion_length": 99.375, + "epoch": 0.147, + "grad_norm": 0.04929376021027565, + "kl": 0.08404329419136047, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0034, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 588 + }, + { + "completion_length": 128.125, + "epoch": 0.14725, + "grad_norm": 0.5995571613311768, + "kl": 0.07869472354650497, + "learning_rate": 4.966073142941239e-06, + "loss": 0.0031, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 589 + }, + { + "completion_length": 123.875, + "epoch": 0.1475, + "grad_norm": 0.7187158465385437, + "kl": 0.11943908035755157, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0048, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 590 + }, + { + "completion_length": 142.875, + "epoch": 0.14775, + "grad_norm": 0.6760448217391968, + "kl": 0.06530044227838516, + "learning_rate": 4.965352986996443e-06, + "loss": 0.0026, + "reward": 1.65625, + "reward_std": 0.48065245151519775, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.75, + "step": 591 + }, + { + "completion_length": 110.125, + "epoch": 0.148, + "grad_norm": 0.5969019532203674, + "kl": 0.11454600095748901, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0046, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 592 + }, + { + "completion_length": 156.25, + "epoch": 0.14825, + "grad_norm": 0.6029123663902283, + "kl": 0.06257858872413635, + "learning_rate": 4.964625321158897e-06, + "loss": 0.0025, + "reward": 1.5, + "reward_std": 0.4225771427154541, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.875, + "step": 593 + }, + { + "completion_length": 165.875, + "epoch": 0.1485, + "grad_norm": 0.6441696882247925, + "kl": 0.0646340623497963, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0026, + "reward": 1.3825000524520874, + "reward_std": 0.7398986220359802, + "rewards/_accuracy_reward": 0.6325000524520874, + "rewards/_format_reward": 0.75, + "step": 594 + }, + { + "completion_length": 129.125, + "epoch": 0.14875, + "grad_norm": 0.026783820241689682, + "kl": 0.057939767837524414, + "learning_rate": 4.963890147645195e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 595 + }, + { + "completion_length": 108.0, + "epoch": 0.149, + "grad_norm": 1.9925416707992554, + "kl": 0.1760830134153366, + "learning_rate": 4.963519746208726e-06, + "loss": 0.007, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 596 + }, + { + "completion_length": 166.25, + "epoch": 0.14925, + "grad_norm": 1.065826416015625, + "kl": 0.05945620313286781, + "learning_rate": 4.963147468694804e-06, + "loss": 0.0024, + "reward": 0.9437500238418579, + "reward_std": 0.8537470102310181, + "rewards/_accuracy_reward": 0.3187499940395355, + "rewards/_format_reward": 0.625, + "step": 597 + }, + { + "completion_length": 154.25, + "epoch": 0.1495, + "grad_norm": 0.8419215679168701, + "kl": 0.270252525806427, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0108, + "reward": 1.506250023841858, + "reward_std": 0.7513974905014038, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.75, + "step": 598 + }, + { + "completion_length": 103.625, + "epoch": 0.14975, + "grad_norm": 0.9585555195808411, + "kl": 0.12021496146917343, + "learning_rate": 4.962397286570053e-06, + "loss": 0.0048, + "reward": 1.506250023841858, + "reward_std": 0.4144165813922882, + "rewards/_accuracy_reward": 0.5062500238418579, + "rewards/_format_reward": 1.0, + "step": 599 + }, + { + "completion_length": 206.625, + "epoch": 0.15, + "grad_norm": 0.8529797792434692, + "kl": 0.1137736588716507, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0046, + "reward": 0.7749999761581421, + "reward_std": 0.7176349759101868, + "rewards/_accuracy_reward": 0.14999999105930328, + "rewards/_format_reward": 0.625, + "step": 600 + }, + { + "completion_length": 134.0, + "epoch": 0.15025, + "grad_norm": 0.17384187877178192, + "kl": 0.08758542686700821, + "learning_rate": 4.961639603556128e-06, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 601 + }, + { + "completion_length": 135.75, + "epoch": 0.1505, + "grad_norm": 1.2760076522827148, + "kl": 0.09961305558681488, + "learning_rate": 4.961257949936092e-06, + "loss": 0.004, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 602 + }, + { + "completion_length": 163.75, + "epoch": 0.15075, + "grad_norm": 0.5337422490119934, + "kl": 0.06328251212835312, + "learning_rate": 4.96087442196106e-06, + "loss": 0.0025, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 603 + }, + { + "completion_length": 141.375, + "epoch": 0.151, + "grad_norm": 1.2539782524108887, + "kl": 0.12791743874549866, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0051, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 604 + }, + { + "completion_length": 172.875, + "epoch": 0.15125, + "grad_norm": 0.6172487139701843, + "kl": 0.07650119066238403, + "learning_rate": 4.960101744115727e-06, + "loss": 0.0031, + "reward": 1.381250023841858, + "reward_std": 0.9133679866790771, + "rewards/_accuracy_reward": 0.6312500238418579, + "rewards/_format_reward": 0.75, + "step": 605 + }, + { + "completion_length": 178.0, + "epoch": 0.1515, + "grad_norm": 0.5221604108810425, + "kl": 0.04332815483212471, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0017, + "reward": 1.5750000476837158, + "reward_std": 0.4605897068977356, + "rewards/_accuracy_reward": 0.574999988079071, + "rewards/_format_reward": 1.0, + "step": 606 + }, + { + "completion_length": 192.25, + "epoch": 0.15175, + "grad_norm": 0.47417011857032776, + "kl": 0.05454748496413231, + "learning_rate": 4.9593215723738405e-06, + "loss": 0.0022, + "reward": 1.0749999284744263, + "reward_std": 0.0707106813788414, + "rewards/_accuracy_reward": 0.07500000298023224, + "rewards/_format_reward": 1.0, + "step": 607 + }, + { + "completion_length": 151.75, + "epoch": 0.152, + "grad_norm": 0.7074270844459534, + "kl": 0.06145765259861946, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0025, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 608 + }, + { + "completion_length": 92.625, + "epoch": 0.15225, + "grad_norm": 1.0133639574050903, + "kl": 0.0878884345293045, + "learning_rate": 4.958533909111936e-06, + "loss": 0.0035, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 609 + }, + { + "completion_length": 140.75, + "epoch": 0.1525, + "grad_norm": 0.5950965881347656, + "kl": 0.050494369119405746, + "learning_rate": 4.958137268909887e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 610 + }, + { + "completion_length": 121.75, + "epoch": 0.15275, + "grad_norm": 0.8706890344619751, + "kl": 0.07878470420837402, + "learning_rate": 4.957738756729375e-06, + "loss": 0.0032, + "reward": 1.431249976158142, + "reward_std": 0.47579821944236755, + "rewards/_accuracy_reward": 0.4312500059604645, + "rewards/_format_reward": 1.0, + "step": 611 + }, + { + "completion_length": 181.25, + "epoch": 0.153, + "grad_norm": 0.4814571738243103, + "kl": 0.05293947085738182, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0021, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 612 + }, + { + "completion_length": 148.0, + "epoch": 0.15325, + "grad_norm": 0.7898260354995728, + "kl": 0.11220979690551758, + "learning_rate": 4.956936117648329e-06, + "loss": 0.0045, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 613 + }, + { + "completion_length": 96.875, + "epoch": 0.1535, + "grad_norm": 0.7721397876739502, + "kl": 0.09208115190267563, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0037, + "reward": 1.59375, + "reward_std": 0.4419417381286621, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 0.875, + "step": 614 + }, + { + "completion_length": 99.625, + "epoch": 0.15375, + "grad_norm": 0.953384518623352, + "kl": 0.07794458419084549, + "learning_rate": 4.956125994313775e-06, + "loss": 0.0031, + "reward": 1.53125, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 1.0, + "step": 615 + }, + { + "completion_length": 140.25, + "epoch": 0.154, + "grad_norm": 0.5639375448226929, + "kl": 0.09136466681957245, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0037, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 616 + }, + { + "completion_length": 159.125, + "epoch": 0.15425, + "grad_norm": 0.9821755290031433, + "kl": 0.08807244896888733, + "learning_rate": 4.955308389193489e-06, + "loss": 0.0035, + "reward": 1.431249976158142, + "reward_std": 0.47579824924468994, + "rewards/_accuracy_reward": 0.4312499761581421, + "rewards/_format_reward": 1.0, + "step": 617 + }, + { + "completion_length": 127.875, + "epoch": 0.1545, + "grad_norm": 0.900895357131958, + "kl": 0.08849354833364487, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0035, + "reward": 1.6875, + "reward_std": 0.4381372928619385, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 618 + }, + { + "completion_length": 122.875, + "epoch": 0.15475, + "grad_norm": 0.10602176189422607, + "kl": 0.11338386684656143, + "learning_rate": 4.95448330477804e-06, + "loss": 0.0045, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 619 + }, + { + "completion_length": 157.625, + "epoch": 0.155, + "grad_norm": 0.620640218257904, + "kl": 0.0592242069542408, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0024, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.668749988079071, + "rewards/_format_reward": 1.0, + "step": 620 + }, + { + "completion_length": 166.0, + "epoch": 0.15525, + "grad_norm": 0.20956626534461975, + "kl": 0.13530687987804413, + "learning_rate": 4.953650743580776e-06, + "loss": 0.0054, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 621 + }, + { + "completion_length": 140.5, + "epoch": 0.1555, + "grad_norm": 0.506284236907959, + "kl": 0.07134924829006195, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0029, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 622 + }, + { + "completion_length": 155.125, + "epoch": 0.15575, + "grad_norm": 1.0659793615341187, + "kl": 0.09406277537345886, + "learning_rate": 4.952810708137824e-06, + "loss": 0.0038, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 623 + }, + { + "completion_length": 134.625, + "epoch": 0.156, + "grad_norm": 0.7838442921638489, + "kl": 0.0715455636382103, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.0029, + "reward": 1.18874990940094, + "reward_std": 0.3359607756137848, + "rewards/_accuracy_reward": 0.1887499988079071, + "rewards/_format_reward": 1.0, + "step": 624 + }, + { + "completion_length": 111.0, + "epoch": 0.15625, + "grad_norm": 0.03516482189297676, + "kl": 0.05203777924180031, + "learning_rate": 4.9519632010080765e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 625 + }, + { + "completion_length": 116.0, + "epoch": 0.1565, + "grad_norm": 1.0075751543045044, + "kl": 0.12949231266975403, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0052, + "reward": 1.21875, + "reward_std": 0.8284828662872314, + "rewards/_accuracy_reward": 0.46875, + "rewards/_format_reward": 0.75, + "step": 626 + }, + { + "completion_length": 163.25, + "epoch": 0.15675, + "grad_norm": 0.7406784296035767, + "kl": 0.08424215018749237, + "learning_rate": 4.951108224773189e-06, + "loss": 0.0034, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 627 + }, + { + "completion_length": 149.625, + "epoch": 0.157, + "grad_norm": 0.7964242100715637, + "kl": 0.10702201724052429, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0043, + "reward": 1.5199999809265137, + "reward_std": 0.5133086442947388, + "rewards/_accuracy_reward": 0.5199999809265137, + "rewards/_format_reward": 1.0, + "step": 628 + }, + { + "completion_length": 160.75, + "epoch": 0.15725, + "grad_norm": 0.8447659611701965, + "kl": 0.11376137286424637, + "learning_rate": 4.950245782037566e-06, + "loss": 0.0046, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 629 + }, + { + "completion_length": 140.375, + "epoch": 0.1575, + "grad_norm": 0.713877260684967, + "kl": 0.06790605187416077, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0027, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 630 + }, + { + "completion_length": 128.875, + "epoch": 0.15775, + "grad_norm": 0.8207094669342041, + "kl": 0.0979921892285347, + "learning_rate": 4.9493758754283575e-06, + "loss": 0.0039, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 631 + }, + { + "completion_length": 117.875, + "epoch": 0.158, + "grad_norm": 1.2843239307403564, + "kl": 0.11575803905725479, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0046, + "reward": 1.399999976158142, + "reward_std": 0.720119059085846, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 0.875, + "step": 632 + }, + { + "completion_length": 130.625, + "epoch": 0.15825, + "grad_norm": 0.7998097538948059, + "kl": 0.10970651358366013, + "learning_rate": 4.9484985075954505e-06, + "loss": 0.0044, + "reward": 1.6262500286102295, + "reward_std": 0.7428312301635742, + "rewards/_accuracy_reward": 0.7512500286102295, + "rewards/_format_reward": 0.875, + "step": 633 + }, + { + "completion_length": 86.875, + "epoch": 0.1585, + "grad_norm": 0.2539229393005371, + "kl": 0.0996176227927208, + "learning_rate": 4.948057026554415e-06, + "loss": 0.004, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 634 + }, + { + "completion_length": 79.375, + "epoch": 0.15875, + "grad_norm": 0.9077969789505005, + "kl": 0.09100610762834549, + "learning_rate": 4.94761368121146e-06, + "loss": 0.0036, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 635 + }, + { + "completion_length": 78.75, + "epoch": 0.159, + "grad_norm": 0.9096614718437195, + "kl": 0.10145445168018341, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0041, + "reward": 1.3875000476837158, + "reward_std": 0.3879893124103546, + "rewards/_accuracy_reward": 0.38749998807907104, + "rewards/_format_reward": 1.0, + "step": 636 + }, + { + "completion_length": 131.375, + "epoch": 0.15925, + "grad_norm": 0.593658983707428, + "kl": 0.08535154908895493, + "learning_rate": 4.94672139897172e-06, + "loss": 0.0034, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 637 + }, + { + "completion_length": 140.125, + "epoch": 0.1595, + "grad_norm": 0.05550685152411461, + "kl": 0.09320636093616486, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 638 + }, + { + "completion_length": 115.125, + "epoch": 0.15975, + "grad_norm": 0.0472259521484375, + "kl": 0.08565588295459747, + "learning_rate": 4.945821663594277e-06, + "loss": 0.0034, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 639 + }, + { + "completion_length": 109.75, + "epoch": 0.16, + "grad_norm": 1.4830896854400635, + "kl": 0.15049146115779877, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.006, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 640 + }, + { + "completion_length": 108.0, + "epoch": 0.16025, + "grad_norm": 0.041197896003723145, + "kl": 0.1003151386976242, + "learning_rate": 4.944914477819881e-06, + "loss": 0.004, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 641 + }, + { + "completion_length": 128.25, + "epoch": 0.1605, + "grad_norm": 0.6992077827453613, + "kl": 0.12279914319515228, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0049, + "reward": 1.3125, + "reward_std": 0.4299086630344391, + "rewards/_accuracy_reward": 0.3124999701976776, + "rewards/_format_reward": 1.0, + "step": 642 + }, + { + "completion_length": 58.25, + "epoch": 0.16075, + "grad_norm": 1.5356488227844238, + "kl": 0.13889075815677643, + "learning_rate": 4.943999844411978e-06, + "loss": 0.0056, + "reward": 1.068750023841858, + "reward_std": 0.8936032652854919, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.625, + "step": 643 + }, + { + "completion_length": 142.875, + "epoch": 0.161, + "grad_norm": 0.6315851211547852, + "kl": 0.07377047091722488, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.003, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 644 + }, + { + "completion_length": 115.25, + "epoch": 0.16125, + "grad_norm": 1.5424988269805908, + "kl": 0.09540820121765137, + "learning_rate": 4.943077766156698e-06, + "loss": 0.0038, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 645 + }, + { + "completion_length": 92.875, + "epoch": 0.1615, + "grad_norm": 0.86021488904953, + "kl": 0.13427940011024475, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0054, + "reward": 1.4375, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 1.0, + "step": 646 + }, + { + "completion_length": 91.0, + "epoch": 0.16175, + "grad_norm": 0.054148320108652115, + "kl": 0.06728003174066544, + "learning_rate": 4.942148245862852e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 647 + }, + { + "completion_length": 95.5, + "epoch": 0.162, + "grad_norm": 0.03492613136768341, + "kl": 0.09255649149417877, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 648 + }, + { + "completion_length": 166.5, + "epoch": 0.16225, + "grad_norm": 0.9836525917053223, + "kl": 0.08547134697437286, + "learning_rate": 4.941211286361922e-06, + "loss": 0.0034, + "reward": 1.756250023841858, + "reward_std": 0.6894291639328003, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 649 + }, + { + "completion_length": 166.875, + "epoch": 0.1625, + "grad_norm": 1.5894322395324707, + "kl": 0.11034282296895981, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0044, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 650 + }, + { + "completion_length": 76.625, + "epoch": 0.16275, + "grad_norm": 0.8325356841087341, + "kl": 0.09968439489603043, + "learning_rate": 4.940266890508048e-06, + "loss": 0.004, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 651 + }, + { + "completion_length": 84.25, + "epoch": 0.163, + "grad_norm": 0.9667750000953674, + "kl": 0.08716170489788055, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0035, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 652 + }, + { + "completion_length": 152.5, + "epoch": 0.16325, + "grad_norm": 0.15192466974258423, + "kl": 0.11996634304523468, + "learning_rate": 4.9393150611780215e-06, + "loss": 0.0048, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 653 + }, + { + "completion_length": 136.5, + "epoch": 0.1635, + "grad_norm": 0.6134188771247864, + "kl": 0.0701427310705185, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0028, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 654 + }, + { + "completion_length": 169.75, + "epoch": 0.16375, + "grad_norm": 0.5714187622070312, + "kl": 0.0795215517282486, + "learning_rate": 4.938355801271282e-06, + "loss": 0.0032, + "reward": 1.5499999523162842, + "reward_std": 0.4855042099952698, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 655 + }, + { + "completion_length": 88.125, + "epoch": 0.164, + "grad_norm": 1.2346508502960205, + "kl": 0.3160145878791809, + "learning_rate": 4.937873385763909e-06, + "loss": 0.0126, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 656 + }, + { + "completion_length": 86.25, + "epoch": 0.16425, + "grad_norm": 0.9534627199172974, + "kl": 0.07120765745639801, + "learning_rate": 4.937389113709902e-06, + "loss": 0.0028, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 657 + }, + { + "completion_length": 112.5, + "epoch": 0.1645, + "grad_norm": 0.7603535652160645, + "kl": 0.1263781189918518, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0051, + "reward": 1.1375000476837158, + "reward_std": 0.548211395740509, + "rewards/_accuracy_reward": 0.26249998807907104, + "rewards/_format_reward": 0.875, + "step": 658 + }, + { + "completion_length": 124.125, + "epoch": 0.16475, + "grad_norm": 1.240847110748291, + "kl": 0.10992362350225449, + "learning_rate": 4.936415001438577e-06, + "loss": 0.0044, + "reward": 1.537500023841858, + "reward_std": 0.7322909235954285, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.875, + "step": 659 + }, + { + "completion_length": 127.625, + "epoch": 0.165, + "grad_norm": 0.9347513914108276, + "kl": 0.10446102917194366, + "learning_rate": 4.935925161963089e-06, + "loss": 0.0042, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.875, + "step": 660 + }, + { + "completion_length": 157.5, + "epoch": 0.16525, + "grad_norm": 1.6045786142349243, + "kl": 0.11034439504146576, + "learning_rate": 4.935433467424624e-06, + "loss": 0.0044, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 661 + }, + { + "completion_length": 123.125, + "epoch": 0.1655, + "grad_norm": 1.0717341899871826, + "kl": 0.10492967069149017, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0042, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 662 + }, + { + "completion_length": 132.125, + "epoch": 0.16575, + "grad_norm": 0.7541182637214661, + "kl": 0.06509540975093842, + "learning_rate": 4.934444514657964e-06, + "loss": 0.0026, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 663 + }, + { + "completion_length": 150.375, + "epoch": 0.166, + "grad_norm": 0.8722965717315674, + "kl": 0.09795466810464859, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0039, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 664 + }, + { + "completion_length": 162.5, + "epoch": 0.16625, + "grad_norm": 0.7149991393089294, + "kl": 0.07934553176164627, + "learning_rate": 4.933448146151122e-06, + "loss": 0.0032, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.625, + "step": 665 + }, + { + "completion_length": 155.5, + "epoch": 0.1665, + "grad_norm": 1.3303704261779785, + "kl": 0.12056293338537216, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0048, + "reward": 1.2950000762939453, + "reward_std": 0.6939946413040161, + "rewards/_accuracy_reward": 0.41999998688697815, + "rewards/_format_reward": 0.875, + "step": 666 + }, + { + "completion_length": 183.0, + "epoch": 0.16675, + "grad_norm": 0.6829738616943359, + "kl": 0.09543811529874802, + "learning_rate": 4.932444364939205e-06, + "loss": 0.0038, + "reward": 1.1687500476837158, + "reward_std": 0.8314949870109558, + "rewards/_accuracy_reward": 0.41874998807907104, + "rewards/_format_reward": 0.75, + "step": 667 + }, + { + "completion_length": 124.375, + "epoch": 0.167, + "grad_norm": 1.0810277462005615, + "kl": 0.11649039387702942, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0047, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 668 + }, + { + "completion_length": 184.875, + "epoch": 0.16725, + "grad_norm": 1.8912442922592163, + "kl": 0.2582206428050995, + "learning_rate": 4.9314331740799084e-06, + "loss": 0.0103, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 669 + }, + { + "completion_length": 135.875, + "epoch": 0.1675, + "grad_norm": 0.932590901851654, + "kl": 0.13341131806373596, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0053, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 670 + }, + { + "completion_length": 92.5, + "epoch": 0.16775, + "grad_norm": 0.3577817976474762, + "kl": 0.11805645376443863, + "learning_rate": 4.930414576653492e-06, + "loss": 0.0047, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 671 + }, + { + "completion_length": 183.25, + "epoch": 0.168, + "grad_norm": 2.2905547618865967, + "kl": 0.2760542035102844, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.011, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 672 + }, + { + "completion_length": 153.75, + "epoch": 0.16825, + "grad_norm": 0.8730438947677612, + "kl": 0.1262456476688385, + "learning_rate": 4.9293885757627815e-06, + "loss": 0.005, + "reward": 1.6875, + "reward_std": 0.4381372928619385, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 673 + }, + { + "completion_length": 141.375, + "epoch": 0.1685, + "grad_norm": 0.07061377912759781, + "kl": 0.09641307592391968, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0039, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 674 + }, + { + "completion_length": 144.5, + "epoch": 0.16875, + "grad_norm": 0.8018800020217896, + "kl": 0.19013966619968414, + "learning_rate": 4.928355174533153e-06, + "loss": 0.0076, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 675 + }, + { + "completion_length": 171.625, + "epoch": 0.169, + "grad_norm": 1.220819115638733, + "kl": 0.15096057951450348, + "learning_rate": 4.92783569977409e-06, + "loss": 0.006, + "reward": 1.149999976158142, + "reward_std": 0.6358346939086914, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.75, + "step": 676 + }, + { + "completion_length": 125.75, + "epoch": 0.16925, + "grad_norm": 1.0009835958480835, + "kl": 0.09804116189479828, + "learning_rate": 4.927314376112528e-06, + "loss": 0.0039, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 677 + }, + { + "completion_length": 155.875, + "epoch": 0.1695, + "grad_norm": 0.7764220833778381, + "kl": 0.12891684472560883, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0052, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 678 + }, + { + "completion_length": 139.25, + "epoch": 0.16975, + "grad_norm": 0.6513742804527283, + "kl": 0.05859963223338127, + "learning_rate": 4.926266183671356e-06, + "loss": 0.0023, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 679 + }, + { + "completion_length": 82.25, + "epoch": 0.17, + "grad_norm": 1.309946894645691, + "kl": 0.12626594305038452, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0051, + "reward": 1.412500023841858, + "reward_std": 0.3691205680370331, + "rewards/_accuracy_reward": 0.4124999940395355, + "rewards/_format_reward": 1.0, + "step": 680 + }, + { + "completion_length": 107.75, + "epoch": 0.17025, + "grad_norm": 1.0792083740234375, + "kl": 0.1906372606754303, + "learning_rate": 4.925210600402615e-06, + "loss": 0.0076, + "reward": 1.537500023841858, + "reward_std": 0.7322909235954285, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.875, + "step": 681 + }, + { + "completion_length": 143.25, + "epoch": 0.1705, + "grad_norm": 0.5776306986808777, + "kl": 0.05421634390950203, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0022, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 682 + }, + { + "completion_length": 99.625, + "epoch": 0.17075, + "grad_norm": 1.0922069549560547, + "kl": 0.16642846167087555, + "learning_rate": 4.924147629521794e-06, + "loss": 0.0067, + "reward": 1.4500000476837158, + "reward_std": 0.6979562044143677, + "rewards/_accuracy_reward": 0.574999988079071, + "rewards/_format_reward": 0.875, + "step": 683 + }, + { + "completion_length": 157.375, + "epoch": 0.171, + "grad_norm": 0.7435649633407593, + "kl": 0.08429885655641556, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0034, + "reward": 1.306249976158142, + "reward_std": 0.4346078038215637, + "rewards/_accuracy_reward": 0.4312499761581421, + "rewards/_format_reward": 0.875, + "step": 684 + }, + { + "completion_length": 140.0, + "epoch": 0.17125, + "grad_norm": 0.8662862181663513, + "kl": 0.06566808372735977, + "learning_rate": 4.923077274266886e-06, + "loss": 0.0026, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 685 + }, + { + "completion_length": 148.0, + "epoch": 0.1715, + "grad_norm": 1.3568053245544434, + "kl": 0.11993306875228882, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0048, + "reward": 1.7512500286102295, + "reward_std": 0.460603266954422, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 0.875, + "step": 686 + }, + { + "completion_length": 116.75, + "epoch": 0.17175, + "grad_norm": 0.056372012943029404, + "kl": 0.11503936350345612, + "learning_rate": 4.92199953789838e-06, + "loss": 0.0046, + "reward": 1.25, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.25, + "rewards/_format_reward": 1.0, + "step": 687 + }, + { + "completion_length": 110.5, + "epoch": 0.172, + "grad_norm": 0.8631861209869385, + "kl": 0.07430507987737656, + "learning_rate": 4.921457902821578e-06, + "loss": 0.003, + "reward": 1.1937499046325684, + "reward_std": 0.33320683240890503, + "rewards/_accuracy_reward": 0.19374999403953552, + "rewards/_format_reward": 1.0, + "step": 688 + }, + { + "completion_length": 85.875, + "epoch": 0.17225, + "grad_norm": 0.04333508759737015, + "kl": 0.11433293670415878, + "learning_rate": 4.920914423699247e-06, + "loss": 0.0046, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 689 + }, + { + "completion_length": 159.75, + "epoch": 0.1725, + "grad_norm": 0.737657368183136, + "kl": 0.07528231292963028, + "learning_rate": 4.92036910094527e-06, + "loss": 0.003, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 690 + }, + { + "completion_length": 120.25, + "epoch": 0.17275, + "grad_norm": 0.8405797481536865, + "kl": 0.07952480018138885, + "learning_rate": 4.919821934974933e-06, + "loss": 0.0032, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 691 + }, + { + "completion_length": 132.5, + "epoch": 0.173, + "grad_norm": 0.6791375279426575, + "kl": 0.0799427479505539, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0032, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7574999928474426, + "rewards/_format_reward": 1.0, + "step": 692 + }, + { + "completion_length": 122.875, + "epoch": 0.17325, + "grad_norm": 0.7748632431030273, + "kl": 0.11183080077171326, + "learning_rate": 4.918722075053349e-06, + "loss": 0.0045, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 693 + }, + { + "completion_length": 100.375, + "epoch": 0.1735, + "grad_norm": 0.9572948217391968, + "kl": 0.0853419229388237, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0034, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 694 + }, + { + "completion_length": 158.875, + "epoch": 0.17375, + "grad_norm": 0.7936824560165405, + "kl": 0.062092866748571396, + "learning_rate": 4.917614847284858e-06, + "loss": 0.0025, + "reward": 1.3125, + "reward_std": 0.4299086630344391, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 1.0, + "step": 695 + }, + { + "completion_length": 147.5, + "epoch": 0.174, + "grad_norm": 0.05352199077606201, + "kl": 0.0950603038072586, + "learning_rate": 4.917058471511149e-06, + "loss": 0.0038, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 696 + }, + { + "completion_length": 189.875, + "epoch": 0.17425, + "grad_norm": 0.8454087972640991, + "kl": 0.14175119996070862, + "learning_rate": 4.916500255042269e-06, + "loss": 0.0057, + "reward": 1.381250023841858, + "reward_std": 0.9133679866790771, + "rewards/_accuracy_reward": 0.6312500238418579, + "rewards/_format_reward": 0.75, + "step": 697 + }, + { + "completion_length": 138.125, + "epoch": 0.1745, + "grad_norm": 0.7943523526191711, + "kl": 0.09450940042734146, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0038, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 698 + }, + { + "completion_length": 139.375, + "epoch": 0.17475, + "grad_norm": 0.6788039803504944, + "kl": 0.06518439203500748, + "learning_rate": 4.915378301720822e-06, + "loss": 0.0026, + "reward": 1.3875000476837158, + "reward_std": 0.3879893124103546, + "rewards/_accuracy_reward": 0.38749998807907104, + "rewards/_format_reward": 1.0, + "step": 699 + }, + { + "completion_length": 116.875, + "epoch": 0.175, + "grad_norm": 0.8311275243759155, + "kl": 0.08171775192022324, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0033, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 700 + }, + { + "completion_length": 150.0, + "epoch": 0.17525, + "grad_norm": 0.5767417550086975, + "kl": 0.06639153510332108, + "learning_rate": 4.914248990738182e-06, + "loss": 0.0027, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 701 + }, + { + "completion_length": 165.25, + "epoch": 0.1755, + "grad_norm": 0.6517013311386108, + "kl": 0.1013181060552597, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0041, + "reward": 1.1624999046325684, + "reward_std": 0.338853120803833, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 0.875, + "step": 702 + }, + { + "completion_length": 113.625, + "epoch": 0.17575, + "grad_norm": 0.9347190260887146, + "kl": 0.09112431108951569, + "learning_rate": 4.913112325534426e-06, + "loss": 0.0036, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 703 + }, + { + "completion_length": 97.75, + "epoch": 0.176, + "grad_norm": 0.05434059351682663, + "kl": 0.09588680416345596, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0038, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 704 + }, + { + "completion_length": 149.625, + "epoch": 0.17625, + "grad_norm": 0.0285491980612278, + "kl": 0.0574335977435112, + "learning_rate": 4.9119683095720325e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 705 + }, + { + "completion_length": 130.625, + "epoch": 0.1765, + "grad_norm": 0.8181139826774597, + "kl": 0.06118696555495262, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.0024, + "reward": 1.6637499332427979, + "reward_std": 0.4691310524940491, + "rewards/_accuracy_reward": 0.6637499928474426, + "rewards/_format_reward": 1.0, + "step": 706 + }, + { + "completion_length": 139.75, + "epoch": 0.17675, + "grad_norm": 0.4688571095466614, + "kl": 0.058115698397159576, + "learning_rate": 4.910816946335875e-06, + "loss": 0.0023, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 707 + }, + { + "completion_length": 164.0, + "epoch": 0.177, + "grad_norm": 0.6474580764770508, + "kl": 0.16749340295791626, + "learning_rate": 4.910238510585275e-06, + "loss": 0.0067, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.518750011920929, + "rewards/_format_reward": 0.875, + "step": 708 + }, + { + "completion_length": 174.25, + "epoch": 0.17725, + "grad_norm": 0.6559945940971375, + "kl": 0.11057315766811371, + "learning_rate": 4.909658239333203e-06, + "loss": 0.0044, + "reward": 1.2687499523162842, + "reward_std": 0.699968159198761, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.75, + "step": 709 + }, + { + "completion_length": 160.25, + "epoch": 0.1775, + "grad_norm": 0.049524981528520584, + "kl": 0.06787683814764023, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0027, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 710 + }, + { + "completion_length": 180.5, + "epoch": 0.17775, + "grad_norm": 0.7570251822471619, + "kl": 0.08942964673042297, + "learning_rate": 4.9084921920936405e-06, + "loss": 0.0036, + "reward": 0.9187499284744263, + "reward_std": 0.3712310194969177, + "rewards/_accuracy_reward": 0.04374999925494194, + "rewards/_format_reward": 0.875, + "step": 711 + }, + { + "completion_length": 163.625, + "epoch": 0.178, + "grad_norm": 0.6288597583770752, + "kl": 0.07151935994625092, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0029, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 712 + }, + { + "completion_length": 132.625, + "epoch": 0.17825, + "grad_norm": 0.044216256588697433, + "kl": 0.07765813916921616, + "learning_rate": 4.907318808169168e-06, + "loss": 0.0031, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 713 + }, + { + "completion_length": 190.0, + "epoch": 0.1785, + "grad_norm": 0.9514076709747314, + "kl": 0.0776422843337059, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0031, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 714 + }, + { + "completion_length": 82.625, + "epoch": 0.17875, + "grad_norm": 0.046869371086359024, + "kl": 0.06022655963897705, + "learning_rate": 4.906138091134118e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 715 + }, + { + "completion_length": 141.625, + "epoch": 0.179, + "grad_norm": 1.30816650390625, + "kl": 0.0730072632431984, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0029, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 716 + }, + { + "completion_length": 109.5, + "epoch": 0.17925, + "grad_norm": 0.7696007490158081, + "kl": 0.08097223192453384, + "learning_rate": 4.904950044585159e-06, + "loss": 0.0032, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 717 + }, + { + "completion_length": 163.0, + "epoch": 0.1795, + "grad_norm": 0.674775242805481, + "kl": 0.06911822408437729, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0028, + "reward": 1.28125, + "reward_std": 0.6844902038574219, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 0.875, + "step": 718 + }, + { + "completion_length": 176.25, + "epoch": 0.17975, + "grad_norm": 0.5623289346694946, + "kl": 0.05954327434301376, + "learning_rate": 4.903754672141288e-06, + "loss": 0.0024, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 719 + }, + { + "completion_length": 147.5, + "epoch": 0.18, + "grad_norm": 0.7491151690483093, + "kl": 0.08063170313835144, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0032, + "reward": 1.4212499856948853, + "reward_std": 0.485222727060318, + "rewards/_accuracy_reward": 0.42124998569488525, + "rewards/_format_reward": 1.0, + "step": 720 + }, + { + "completion_length": 147.625, + "epoch": 0.18025, + "grad_norm": 0.0454736165702343, + "kl": 0.06635451316833496, + "learning_rate": 4.902551977443813e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 721 + }, + { + "completion_length": 196.875, + "epoch": 0.1805, + "grad_norm": 0.5895596146583557, + "kl": 0.11441276967525482, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0046, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 722 + }, + { + "completion_length": 169.0, + "epoch": 0.18075, + "grad_norm": 0.07282302528619766, + "kl": 0.09880199283361435, + "learning_rate": 4.901341964156356e-06, + "loss": 0.004, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 723 + }, + { + "completion_length": 109.875, + "epoch": 0.181, + "grad_norm": 0.036335721611976624, + "kl": 0.05390893295407295, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 724 + }, + { + "completion_length": 130.0, + "epoch": 0.18125, + "grad_norm": 0.9860284924507141, + "kl": 0.10044866800308228, + "learning_rate": 4.900124635964823e-06, + "loss": 0.004, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 725 + }, + { + "completion_length": 162.625, + "epoch": 0.1815, + "grad_norm": 0.8596735596656799, + "kl": 0.09692024439573288, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0039, + "reward": 1.6375000476837158, + "reward_std": 0.7224709987640381, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 0.875, + "step": 726 + }, + { + "completion_length": 102.25, + "epoch": 0.18175, + "grad_norm": 0.039204664528369904, + "kl": 0.07592302560806274, + "learning_rate": 4.898899996577407e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 727 + }, + { + "completion_length": 131.5, + "epoch": 0.182, + "grad_norm": 0.8768433332443237, + "kl": 0.07269235700368881, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0029, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 728 + }, + { + "completion_length": 108.5, + "epoch": 0.18225, + "grad_norm": 0.023332836106419563, + "kl": 0.03946740925312042, + "learning_rate": 4.897668049724574e-06, + "loss": 0.0016, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 729 + }, + { + "completion_length": 187.0, + "epoch": 0.1825, + "grad_norm": 0.7027626633644104, + "kl": 0.09945539385080338, + "learning_rate": 4.897049337170483e-06, + "loss": 0.004, + "reward": 1.181249976158142, + "reward_std": 0.6335486769676208, + "rewards/_accuracy_reward": 0.3062499761581421, + "rewards/_format_reward": 0.875, + "step": 730 + }, + { + "completion_length": 167.375, + "epoch": 0.18275, + "grad_norm": 0.029858523979783058, + "kl": 0.04215020686388016, + "learning_rate": 4.896428799159048e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 731 + }, + { + "completion_length": 108.625, + "epoch": 0.183, + "grad_norm": 0.8350287675857544, + "kl": 0.058248016983270645, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0023, + "reward": 1.2000000476837158, + "reward_std": 0.09258202463388443, + "rewards/_accuracy_reward": 0.20000000298023224, + "rewards/_format_reward": 1.0, + "step": 732 + }, + { + "completion_length": 153.25, + "epoch": 0.18325, + "grad_norm": 0.8019189238548279, + "kl": 0.06479734927415848, + "learning_rate": 4.8951822486557985e-06, + "loss": 0.0026, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 733 + }, + { + "completion_length": 117.0, + "epoch": 0.1835, + "grad_norm": 0.0786304697394371, + "kl": 0.07490724325180054, + "learning_rate": 4.894556237113287e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 734 + }, + { + "completion_length": 153.25, + "epoch": 0.18375, + "grad_norm": 0.7346017360687256, + "kl": 0.0502316989004612, + "learning_rate": 4.8939284020120365e-06, + "loss": 0.002, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 735 + }, + { + "completion_length": 113.625, + "epoch": 0.184, + "grad_norm": 0.7519781589508057, + "kl": 0.06321275234222412, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0025, + "reward": 1.4500000476837158, + "reward_std": 0.46445053815841675, + "rewards/_accuracy_reward": 0.574999988079071, + "rewards/_format_reward": 0.875, + "step": 736 + }, + { + "completion_length": 176.75, + "epoch": 0.18425, + "grad_norm": 0.5180202126502991, + "kl": 0.06774277240037918, + "learning_rate": 4.892667263047196e-06, + "loss": 0.0027, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 737 + }, + { + "completion_length": 188.75, + "epoch": 0.1845, + "grad_norm": 0.5825796723365784, + "kl": 0.05548809841275215, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0022, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 738 + }, + { + "completion_length": 146.625, + "epoch": 0.18475, + "grad_norm": 0.9164865016937256, + "kl": 0.07509761303663254, + "learning_rate": 4.891398835602925e-06, + "loss": 0.003, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 739 + }, + { + "completion_length": 140.125, + "epoch": 0.185, + "grad_norm": 0.8452046513557434, + "kl": 0.054703302681446075, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0022, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 740 + }, + { + "completion_length": 99.625, + "epoch": 0.18525, + "grad_norm": 0.051303569227457047, + "kl": 0.08967574685811996, + "learning_rate": 4.890123123543074e-06, + "loss": 0.0036, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 741 + }, + { + "completion_length": 142.875, + "epoch": 0.1855, + "grad_norm": 0.7844648361206055, + "kl": 0.057571351528167725, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0023, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 742 + }, + { + "completion_length": 126.5, + "epoch": 0.18575, + "grad_norm": 0.8076685667037964, + "kl": 0.07212512940168381, + "learning_rate": 4.888840130753681e-06, + "loss": 0.0029, + "reward": 1.0374999046325684, + "reward_std": 0.5350233912467957, + "rewards/_accuracy_reward": 0.16249999403953552, + "rewards/_format_reward": 0.875, + "step": 743 + }, + { + "completion_length": 102.125, + "epoch": 0.186, + "grad_norm": 0.02857026271522045, + "kl": 0.045980606228113174, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 744 + }, + { + "completion_length": 102.625, + "epoch": 0.18625, + "grad_norm": 0.04947218671441078, + "kl": 0.08267652988433838, + "learning_rate": 4.887549861142967e-06, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 745 + }, + { + "completion_length": 99.5, + "epoch": 0.1865, + "grad_norm": 1.0107591152191162, + "kl": 0.07869725674390793, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0031, + "reward": 1.34375, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.34375, + "rewards/_format_reward": 1.0, + "step": 746 + }, + { + "completion_length": 148.0, + "epoch": 0.18675, + "grad_norm": 0.8579306602478027, + "kl": 0.06560367345809937, + "learning_rate": 4.886252318641316e-06, + "loss": 0.0026, + "reward": 1.125, + "reward_std": 0.10350988060235977, + "rewards/_accuracy_reward": 0.125, + "rewards/_format_reward": 1.0, + "step": 747 + }, + { + "completion_length": 170.875, + "epoch": 0.187, + "grad_norm": 0.5661394596099854, + "kl": 0.08825960755348206, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0035, + "reward": 1.274999976158142, + "reward_std": 0.6974443197250366, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.875, + "step": 748 + }, + { + "completion_length": 137.0, + "epoch": 0.18725, + "grad_norm": 0.8725441694259644, + "kl": 0.048608239740133286, + "learning_rate": 4.884947507201268e-06, + "loss": 0.0019, + "reward": 1.0437499284744263, + "reward_std": 0.5212878584861755, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 0.875, + "step": 749 + }, + { + "completion_length": 85.125, + "epoch": 0.1875, + "grad_norm": 1.9548423290252686, + "kl": 0.10041142255067825, + "learning_rate": 4.884292376870567e-06, + "loss": 0.004, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 750 + }, + { + "completion_length": 143.0, + "epoch": 0.18775, + "grad_norm": 0.8026706576347351, + "kl": 0.060220833867788315, + "learning_rate": 4.883635430797503e-06, + "loss": 0.0024, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 751 + }, + { + "completion_length": 151.125, + "epoch": 0.188, + "grad_norm": 0.5478430986404419, + "kl": 0.051096536219120026, + "learning_rate": 4.882976669482368e-06, + "loss": 0.002, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 752 + }, + { + "completion_length": 117.5, + "epoch": 0.18825, + "grad_norm": 0.6958257555961609, + "kl": 0.048697832971811295, + "learning_rate": 4.8823160934268365e-06, + "loss": 0.0019, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 753 + }, + { + "completion_length": 147.25, + "epoch": 0.1885, + "grad_norm": 0.6645660996437073, + "kl": 0.0725683867931366, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0029, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.875, + "step": 754 + }, + { + "completion_length": 147.125, + "epoch": 0.18875, + "grad_norm": 0.639782726764679, + "kl": 0.047644682228565216, + "learning_rate": 4.880989499108196e-06, + "loss": 0.0019, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 1.0, + "step": 755 + }, + { + "completion_length": 176.875, + "epoch": 0.189, + "grad_norm": 0.6647182106971741, + "kl": 0.07348424941301346, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0029, + "reward": 1.6324999332427979, + "reward_std": 0.7272404432296753, + "rewards/_accuracy_reward": 0.7575000524520874, + "rewards/_format_reward": 0.875, + "step": 756 + }, + { + "completion_length": 144.5, + "epoch": 0.18925, + "grad_norm": 0.04022838547825813, + "kl": 0.06717957556247711, + "learning_rate": 4.8796556518826196e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 757 + }, + { + "completion_length": 170.625, + "epoch": 0.1895, + "grad_norm": 0.6884375810623169, + "kl": 0.04896247014403343, + "learning_rate": 4.878986009698596e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 758 + }, + { + "completion_length": 153.5, + "epoch": 0.18975, + "grad_norm": 0.7841867804527283, + "kl": 0.07170876115560532, + "learning_rate": 4.878314555813237e-06, + "loss": 0.0029, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 759 + }, + { + "completion_length": 141.0, + "epoch": 0.19, + "grad_norm": 0.7056874632835388, + "kl": 0.0750083476305008, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.003, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 760 + }, + { + "completion_length": 130.25, + "epoch": 0.19025, + "grad_norm": 0.884601354598999, + "kl": 0.0460171103477478, + "learning_rate": 4.876966214985259e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 761 + }, + { + "completion_length": 107.625, + "epoch": 0.1905, + "grad_norm": 0.6765369176864624, + "kl": 0.06972219794988632, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0028, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 762 + }, + { + "completion_length": 145.625, + "epoch": 0.19075, + "grad_norm": 0.043741121888160706, + "kl": 0.06062021851539612, + "learning_rate": 4.875610633505965e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 763 + }, + { + "completion_length": 131.375, + "epoch": 0.191, + "grad_norm": 0.5471161603927612, + "kl": 0.03881002590060234, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0016, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 764 + }, + { + "completion_length": 91.25, + "epoch": 0.19125, + "grad_norm": 0.7647479176521301, + "kl": 0.0523533895611763, + "learning_rate": 4.874247815504693e-06, + "loss": 0.0021, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 765 + }, + { + "completion_length": 160.5, + "epoch": 0.1915, + "grad_norm": 0.6481146216392517, + "kl": 0.052776042371988297, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0021, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.875, + "step": 766 + }, + { + "completion_length": 165.75, + "epoch": 0.19175, + "grad_norm": 0.790132999420166, + "kl": 0.07757475972175598, + "learning_rate": 4.872877765132822e-06, + "loss": 0.0031, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 767 + }, + { + "completion_length": 214.125, + "epoch": 0.192, + "grad_norm": 0.4553297460079193, + "kl": 0.06625451147556305, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0027, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 768 + }, + { + "completion_length": 136.625, + "epoch": 0.19225, + "grad_norm": 0.7693415880203247, + "kl": 0.06646943092346191, + "learning_rate": 4.8715004865637616e-06, + "loss": 0.0027, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 769 + }, + { + "completion_length": 171.875, + "epoch": 0.1925, + "grad_norm": 0.7193809747695923, + "kl": 0.050990864634513855, + "learning_rate": 4.870809138015499e-06, + "loss": 0.002, + "reward": 1.2999999523162842, + "reward_std": 0.6917885541915894, + "rewards/_accuracy_reward": 0.42499998211860657, + "rewards/_format_reward": 0.875, + "step": 770 + }, + { + "completion_length": 159.0, + "epoch": 0.19275, + "grad_norm": 0.8034641742706299, + "kl": 0.056241609156131744, + "learning_rate": 4.870115983992944e-06, + "loss": 0.0022, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 771 + }, + { + "completion_length": 180.5, + "epoch": 0.193, + "grad_norm": 0.02887933887541294, + "kl": 0.05732205510139465, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 772 + }, + { + "completion_length": 177.25, + "epoch": 0.19325, + "grad_norm": 0.03681923449039459, + "kl": 0.05520891398191452, + "learning_rate": 4.8687242616378026e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 773 + }, + { + "completion_length": 145.5, + "epoch": 0.1935, + "grad_norm": 0.7011691331863403, + "kl": 0.060724180191755295, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0024, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 774 + }, + { + "completion_length": 153.875, + "epoch": 0.19375, + "grad_norm": 0.8208072185516357, + "kl": 0.06594527512788773, + "learning_rate": 4.867325323737765e-06, + "loss": 0.0026, + "reward": 1.662500023841858, + "reward_std": 0.7024192810058594, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 0.875, + "step": 775 + }, + { + "completion_length": 73.5, + "epoch": 0.194, + "grad_norm": 1.4499397277832031, + "kl": 0.07009965926408768, + "learning_rate": 4.866623150289241e-06, + "loss": 0.0028, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 776 + }, + { + "completion_length": 145.875, + "epoch": 0.19425, + "grad_norm": 0.7058715224266052, + "kl": 0.04987990856170654, + "learning_rate": 4.865919174554238e-06, + "loss": 0.002, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 777 + }, + { + "completion_length": 171.375, + "epoch": 0.1945, + "grad_norm": 0.552635908126831, + "kl": 0.06489388644695282, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0026, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.2812499701976776, + "rewards/_format_reward": 0.875, + "step": 778 + }, + { + "completion_length": 115.5, + "epoch": 0.19475, + "grad_norm": 0.7429036498069763, + "kl": 0.03541000187397003, + "learning_rate": 4.8645058183705976e-06, + "loss": 0.0014, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 779 + }, + { + "completion_length": 139.375, + "epoch": 0.195, + "grad_norm": 0.6708275675773621, + "kl": 0.0737345740199089, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0029, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 780 + }, + { + "completion_length": 80.625, + "epoch": 0.19525, + "grad_norm": 1.0399796962738037, + "kl": 0.06832250952720642, + "learning_rate": 4.863085259492171e-06, + "loss": 0.0027, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 781 + }, + { + "completion_length": 84.5, + "epoch": 0.1955, + "grad_norm": 0.02351105399429798, + "kl": 0.053186241537332535, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 782 + }, + { + "completion_length": 115.125, + "epoch": 0.19575, + "grad_norm": 0.7121822834014893, + "kl": 0.07135632634162903, + "learning_rate": 4.861657502246226e-06, + "loss": 0.0029, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 783 + }, + { + "completion_length": 126.625, + "epoch": 0.196, + "grad_norm": 0.8840348720550537, + "kl": 0.03291170299053192, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0013, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 784 + }, + { + "completion_length": 190.625, + "epoch": 0.19625, + "grad_norm": 0.5434854626655579, + "kl": 0.04633787274360657, + "learning_rate": 4.860222550981961e-06, + "loss": 0.0019, + "reward": 1.3125, + "reward_std": 0.873723566532135, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.75, + "step": 785 + }, + { + "completion_length": 138.25, + "epoch": 0.1965, + "grad_norm": 0.6125333309173584, + "kl": 0.04772162437438965, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 786 + }, + { + "completion_length": 160.375, + "epoch": 0.19675, + "grad_norm": 0.7128032445907593, + "kl": 0.07775954157114029, + "learning_rate": 4.858780410070484e-06, + "loss": 0.0031, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.643750011920929, + "rewards/_format_reward": 1.0, + "step": 787 + }, + { + "completion_length": 172.125, + "epoch": 0.197, + "grad_norm": 0.9440908432006836, + "kl": 0.06862717866897583, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0027, + "reward": 1.1437499523162842, + "reward_std": 0.8317097425460815, + "rewards/_accuracy_reward": 0.39374998211860657, + "rewards/_format_reward": 0.75, + "step": 788 + }, + { + "completion_length": 198.25, + "epoch": 0.19725, + "grad_norm": 0.6834743618965149, + "kl": 0.055516257882118225, + "learning_rate": 4.8573310839048085e-06, + "loss": 0.0022, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 789 + }, + { + "completion_length": 172.125, + "epoch": 0.1975, + "grad_norm": 0.5438332557678223, + "kl": 0.07543152570724487, + "learning_rate": 4.856603727730446e-06, + "loss": 0.003, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 790 + }, + { + "completion_length": 134.125, + "epoch": 0.19775, + "grad_norm": 0.6861331462860107, + "kl": 0.06321967393159866, + "learning_rate": 4.855874576899831e-06, + "loss": 0.0025, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 1.0, + "step": 791 + }, + { + "completion_length": 104.75, + "epoch": 0.198, + "grad_norm": 1.0154536962509155, + "kl": 0.08728273957967758, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0035, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 792 + }, + { + "completion_length": 132.0, + "epoch": 0.19825, + "grad_norm": 0.7148137092590332, + "kl": 0.09352617710828781, + "learning_rate": 4.854410893492326e-06, + "loss": 0.0037, + "reward": 1.4375, + "reward_std": 0.8530408143997192, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.75, + "step": 793 + }, + { + "completion_length": 136.25, + "epoch": 0.1985, + "grad_norm": 0.6906197667121887, + "kl": 0.056963000446558, + "learning_rate": 4.853676362030095e-06, + "loss": 0.0023, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 794 + }, + { + "completion_length": 176.5, + "epoch": 0.19875, + "grad_norm": 0.04440414160490036, + "kl": 0.08904334902763367, + "learning_rate": 4.852940038140927e-06, + "loss": 0.0036, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 795 + }, + { + "completion_length": 120.375, + "epoch": 0.199, + "grad_norm": 0.07577986270189285, + "kl": 0.07960440963506699, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 796 + }, + { + "completion_length": 169.625, + "epoch": 0.19925, + "grad_norm": 0.6561159491539001, + "kl": 0.06286200135946274, + "learning_rate": 4.851462015326114e-06, + "loss": 0.0025, + "reward": 1.502500057220459, + "reward_std": 0.7540509700775146, + "rewards/_accuracy_reward": 0.627500057220459, + "rewards/_format_reward": 0.875, + "step": 797 + }, + { + "completion_length": 117.75, + "epoch": 0.1995, + "grad_norm": 0.6921773552894592, + "kl": 0.03880568593740463, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0016, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 798 + }, + { + "completion_length": 174.125, + "epoch": 0.19975, + "grad_norm": 0.025037772953510284, + "kl": 0.05044008791446686, + "learning_rate": 4.8499768295502e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 799 + }, + { + "completion_length": 117.25, + "epoch": 0.2, + "grad_norm": 0.742163896560669, + "kl": 0.08251883089542389, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0033, + "reward": 1.6887500286102295, + "reward_std": 0.43590423464775085, + "rewards/_accuracy_reward": 0.6887500286102295, + "rewards/_format_reward": 1.0, + "step": 800 + }, + { + "completion_length": 134.5, + "epoch": 0.20025, + "grad_norm": 0.8082313537597656, + "kl": 0.06866247951984406, + "learning_rate": 4.8484844853373205e-06, + "loss": 0.0027, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 801 + }, + { + "completion_length": 140.375, + "epoch": 0.2005, + "grad_norm": 0.04818421229720116, + "kl": 0.07798528671264648, + "learning_rate": 4.847735630236773e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 802 + }, + { + "completion_length": 136.625, + "epoch": 0.20075, + "grad_norm": 0.741947591304779, + "kl": 0.07387517392635345, + "learning_rate": 4.846984987233414e-06, + "loss": 0.003, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 803 + }, + { + "completion_length": 142.625, + "epoch": 0.201, + "grad_norm": 0.862678050994873, + "kl": 0.06001199409365654, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0024, + "reward": 1.4187500476837158, + "reward_std": 0.7235515117645264, + "rewards/_accuracy_reward": 0.543749988079071, + "rewards/_format_reward": 0.875, + "step": 804 + }, + { + "completion_length": 156.5, + "epoch": 0.20125, + "grad_norm": 0.6630946397781372, + "kl": 0.06096571311354637, + "learning_rate": 4.845478339806211e-06, + "loss": 0.0024, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 805 + }, + { + "completion_length": 128.875, + "epoch": 0.2015, + "grad_norm": 0.6322165131568909, + "kl": 0.054042182862758636, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0022, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 806 + }, + { + "completion_length": 116.125, + "epoch": 0.20175, + "grad_norm": 0.06692986190319061, + "kl": 0.0923774242401123, + "learning_rate": 4.843964547645221e-06, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 807 + }, + { + "completion_length": 110.75, + "epoch": 0.202, + "grad_norm": 1.2808027267456055, + "kl": 0.07200721651315689, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0029, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 808 + }, + { + "completion_length": 153.25, + "epoch": 0.20225, + "grad_norm": 0.6430820822715759, + "kl": 0.061430469155311584, + "learning_rate": 4.842443615361718e-06, + "loss": 0.0025, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 809 + }, + { + "completion_length": 117.125, + "epoch": 0.2025, + "grad_norm": 0.9564692974090576, + "kl": 0.09131399542093277, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0037, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 810 + }, + { + "completion_length": 163.375, + "epoch": 0.20275, + "grad_norm": 0.6316709518432617, + "kl": 0.05354610085487366, + "learning_rate": 4.840915547588725e-06, + "loss": 0.0021, + "reward": 1.3125, + "reward_std": 0.4299086630344391, + "rewards/_accuracy_reward": 0.3124999701976776, + "rewards/_format_reward": 1.0, + "step": 811 + }, + { + "completion_length": 169.5, + "epoch": 0.203, + "grad_norm": 0.6619630455970764, + "kl": 0.07358560711145401, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0029, + "reward": 1.287500023841858, + "reward_std": 0.8786150217056274, + "rewards/_accuracy_reward": 0.5375000238418579, + "rewards/_format_reward": 0.75, + "step": 812 + }, + { + "completion_length": 161.875, + "epoch": 0.20325, + "grad_norm": 0.6946004629135132, + "kl": 0.06069519370794296, + "learning_rate": 4.839380348981002e-06, + "loss": 0.0024, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 813 + }, + { + "completion_length": 120.0, + "epoch": 0.2035, + "grad_norm": 0.7291589379310608, + "kl": 0.07319469004869461, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0029, + "reward": 1.412500023841858, + "reward_std": 0.7273975014686584, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.75, + "step": 814 + }, + { + "completion_length": 155.75, + "epoch": 0.20375, + "grad_norm": 0.6683049201965332, + "kl": 0.06487289816141129, + "learning_rate": 4.83783802421503e-06, + "loss": 0.0026, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7574999928474426, + "rewards/_format_reward": 1.0, + "step": 815 + }, + { + "completion_length": 60.625, + "epoch": 0.204, + "grad_norm": 1.0868297815322876, + "kl": 0.07810930162668228, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0031, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 816 + }, + { + "completion_length": 153.375, + "epoch": 0.20425, + "grad_norm": 0.03452404588460922, + "kl": 0.06881213933229446, + "learning_rate": 4.836288577988997e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 817 + }, + { + "completion_length": 114.0, + "epoch": 0.2045, + "grad_norm": 0.8613672256469727, + "kl": 0.0681089535355568, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0027, + "reward": 1.1624999046325684, + "reward_std": 0.6214901804924011, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 0.875, + "step": 818 + }, + { + "completion_length": 133.375, + "epoch": 0.20475, + "grad_norm": 0.030039411038160324, + "kl": 0.06410800665616989, + "learning_rate": 4.834732015022786e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 819 + }, + { + "completion_length": 129.625, + "epoch": 0.205, + "grad_norm": 0.03727323189377785, + "kl": 0.06535054743289948, + "learning_rate": 4.833951066243004e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 820 + }, + { + "completion_length": 127.125, + "epoch": 0.20525, + "grad_norm": 0.7753097414970398, + "kl": 0.05497225001454353, + "learning_rate": 4.833168340057957e-06, + "loss": 0.0022, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.6687500476837158, + "rewards/_format_reward": 1.0, + "step": 821 + }, + { + "completion_length": 117.5, + "epoch": 0.2055, + "grad_norm": 1.0983308553695679, + "kl": 0.07010284066200256, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0028, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 822 + }, + { + "completion_length": 182.0, + "epoch": 0.20575, + "grad_norm": 0.9410927891731262, + "kl": 0.08402031660079956, + "learning_rate": 4.831597557857736e-06, + "loss": 0.0034, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 823 + }, + { + "completion_length": 100.375, + "epoch": 0.206, + "grad_norm": 0.8562926650047302, + "kl": 0.10385450720787048, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0042, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 824 + }, + { + "completion_length": 92.875, + "epoch": 0.20625, + "grad_norm": 0.051143430173397064, + "kl": 0.07470560818910599, + "learning_rate": 4.830019673206997e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 825 + }, + { + "completion_length": 173.75, + "epoch": 0.2065, + "grad_norm": 0.4477454423904419, + "kl": 0.055387578904628754, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0022, + "reward": 0.9087499380111694, + "reward_std": 0.3676348030567169, + "rewards/_accuracy_reward": 0.03374999761581421, + "rewards/_format_reward": 0.875, + "step": 826 + }, + { + "completion_length": 123.5, + "epoch": 0.20675, + "grad_norm": 0.7335723638534546, + "kl": 0.06967341154813766, + "learning_rate": 4.828434690912251e-06, + "loss": 0.0028, + "reward": 1.4562499523162842, + "reward_std": 0.45781898498535156, + "rewards/_accuracy_reward": 0.45624998211860657, + "rewards/_format_reward": 1.0, + "step": 827 + }, + { + "completion_length": 158.375, + "epoch": 0.207, + "grad_norm": 0.787886917591095, + "kl": 0.04925302416086197, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.002, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 828 + }, + { + "completion_length": 145.5, + "epoch": 0.20725, + "grad_norm": 0.045704782009124756, + "kl": 0.06455915421247482, + "learning_rate": 4.826842615801628e-06, + "loss": 0.0026, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 829 + }, + { + "completion_length": 159.75, + "epoch": 0.2075, + "grad_norm": 0.7860205173492432, + "kl": 0.06909380853176117, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0028, + "reward": 1.212499976158142, + "reward_std": 0.6180325746536255, + "rewards/_accuracy_reward": 0.3374999761581421, + "rewards/_format_reward": 0.875, + "step": 830 + }, + { + "completion_length": 141.125, + "epoch": 0.20775, + "grad_norm": 0.7281314134597778, + "kl": 0.05228433758020401, + "learning_rate": 4.825243452724865e-06, + "loss": 0.0021, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 831 + }, + { + "completion_length": 105.375, + "epoch": 0.208, + "grad_norm": 0.6620430946350098, + "kl": 0.057157788425683975, + "learning_rate": 4.824441214720629e-06, + "loss": 0.0023, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 832 + }, + { + "completion_length": 161.375, + "epoch": 0.20825, + "grad_norm": 0.6484058499336243, + "kl": 0.05974971503019333, + "learning_rate": 4.823637206553292e-06, + "loss": 0.0024, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 833 + }, + { + "completion_length": 150.125, + "epoch": 0.2085, + "grad_norm": 0.6747198104858398, + "kl": 0.0536816343665123, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0021, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 834 + }, + { + "completion_length": 149.125, + "epoch": 0.20875, + "grad_norm": 0.7284613847732544, + "kl": 0.04191889986395836, + "learning_rate": 4.822023882179811e-06, + "loss": 0.0017, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 835 + }, + { + "completion_length": 111.25, + "epoch": 0.209, + "grad_norm": 0.880097508430481, + "kl": 0.056231893599033356, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0022, + "reward": 1.693750023841858, + "reward_std": 0.4271479547023773, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 836 + }, + { + "completion_length": 110.375, + "epoch": 0.20925, + "grad_norm": 0.04309391230344772, + "kl": 0.044359609484672546, + "learning_rate": 4.820403484518889e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 837 + }, + { + "completion_length": 160.125, + "epoch": 0.2095, + "grad_norm": 0.6122051477432251, + "kl": 0.07158859074115753, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0029, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 838 + }, + { + "completion_length": 144.75, + "epoch": 0.20975, + "grad_norm": 0.739141047000885, + "kl": 0.04880441725254059, + "learning_rate": 4.818776018506538e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 839 + }, + { + "completion_length": 158.75, + "epoch": 0.21, + "grad_norm": 0.7204335331916809, + "kl": 0.08992164582014084, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0036, + "reward": 1.693750023841858, + "reward_std": 0.4271479547023773, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 840 + }, + { + "completion_length": 156.25, + "epoch": 0.21025, + "grad_norm": 0.9104655385017395, + "kl": 0.060113731771707535, + "learning_rate": 4.817141489100302e-06, + "loss": 0.0024, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 841 + }, + { + "completion_length": 163.0, + "epoch": 0.2105, + "grad_norm": 0.6259116530418396, + "kl": 0.05476780980825424, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0022, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 842 + }, + { + "completion_length": 137.0, + "epoch": 0.21075, + "grad_norm": 0.04252244159579277, + "kl": 0.06695520132780075, + "learning_rate": 4.815499901279242e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 843 + }, + { + "completion_length": 139.75, + "epoch": 0.211, + "grad_norm": 0.025353508070111275, + "kl": 0.05481298640370369, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 844 + }, + { + "completion_length": 78.125, + "epoch": 0.21125, + "grad_norm": 1.239617943763733, + "kl": 0.08213215321302414, + "learning_rate": 4.8138512600439165e-06, + "loss": 0.0033, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 845 + }, + { + "completion_length": 151.0, + "epoch": 0.2115, + "grad_norm": 0.6661514639854431, + "kl": 0.061599262058734894, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0025, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 846 + }, + { + "completion_length": 188.625, + "epoch": 0.21175, + "grad_norm": 0.6768038272857666, + "kl": 0.08369094878435135, + "learning_rate": 4.812195570416374e-06, + "loss": 0.0033, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 847 + }, + { + "completion_length": 122.875, + "epoch": 0.212, + "grad_norm": 0.029910584911704063, + "kl": 0.048999685794115067, + "learning_rate": 4.811365084030784e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 848 + }, + { + "completion_length": 118.5, + "epoch": 0.21225, + "grad_norm": 0.8244682550430298, + "kl": 0.0865970253944397, + "learning_rate": 4.810532837440134e-06, + "loss": 0.0035, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 849 + }, + { + "completion_length": 151.25, + "epoch": 0.2125, + "grad_norm": 0.03901531174778938, + "kl": 0.05925080180168152, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 850 + }, + { + "completion_length": 99.125, + "epoch": 0.21275, + "grad_norm": 0.764251708984375, + "kl": 0.08566372096538544, + "learning_rate": 4.808863066180167e-06, + "loss": 0.0034, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 851 + }, + { + "completion_length": 149.375, + "epoch": 0.213, + "grad_norm": 0.04004070907831192, + "kl": 0.0646246075630188, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 852 + }, + { + "completion_length": 148.875, + "epoch": 0.21325, + "grad_norm": 0.6666520237922668, + "kl": 0.037436630576848984, + "learning_rate": 4.807186261722886e-06, + "loss": 0.0015, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 853 + }, + { + "completion_length": 127.5, + "epoch": 0.2135, + "grad_norm": 0.6818517446517944, + "kl": 0.04620020091533661, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 854 + }, + { + "completion_length": 176.625, + "epoch": 0.21375, + "grad_norm": 0.7384047508239746, + "kl": 0.05121048539876938, + "learning_rate": 4.80550242917613e-06, + "loss": 0.002, + "reward": 1.7512500286102295, + "reward_std": 0.460603266954422, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 0.875, + "step": 855 + }, + { + "completion_length": 154.25, + "epoch": 0.214, + "grad_norm": 0.6194064617156982, + "kl": 0.059958089143037796, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0024, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 856 + }, + { + "completion_length": 98.0, + "epoch": 0.21425, + "grad_norm": 0.8191832304000854, + "kl": 0.058027323335409164, + "learning_rate": 4.803811573669143e-06, + "loss": 0.0023, + "reward": 1.4562499523162842, + "reward_std": 0.45781898498535156, + "rewards/_accuracy_reward": 0.45624998211860657, + "rewards/_format_reward": 1.0, + "step": 857 + }, + { + "completion_length": 108.375, + "epoch": 0.2145, + "grad_norm": 0.6157840490341187, + "kl": 0.03446981683373451, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0014, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 858 + }, + { + "completion_length": 139.375, + "epoch": 0.21475, + "grad_norm": 0.5992308259010315, + "kl": 0.03977646678686142, + "learning_rate": 4.802113700352567e-06, + "loss": 0.0016, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 859 + }, + { + "completion_length": 130.125, + "epoch": 0.215, + "grad_norm": 0.6911972165107727, + "kl": 0.06907260417938232, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0028, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 860 + }, + { + "completion_length": 144.25, + "epoch": 0.21525, + "grad_norm": 0.928624153137207, + "kl": 0.06295520812273026, + "learning_rate": 4.800408814398414e-06, + "loss": 0.0025, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 861 + }, + { + "completion_length": 122.625, + "epoch": 0.2155, + "grad_norm": 0.8381000757217407, + "kl": 0.05789494886994362, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0023, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 862 + }, + { + "completion_length": 152.625, + "epoch": 0.21575, + "grad_norm": 0.5595877766609192, + "kl": 0.057784553617239, + "learning_rate": 4.798696921000066e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 863 + }, + { + "completion_length": 132.5, + "epoch": 0.216, + "grad_norm": 0.02260027453303337, + "kl": 0.05030575767159462, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 864 + }, + { + "completion_length": 118.875, + "epoch": 0.21625, + "grad_norm": 0.7802910804748535, + "kl": 0.05282498896121979, + "learning_rate": 4.796978025372247e-06, + "loss": 0.0021, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.875, + "step": 865 + }, + { + "completion_length": 156.125, + "epoch": 0.2165, + "grad_norm": 0.02012362889945507, + "kl": 0.04851672798395157, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 866 + }, + { + "completion_length": 138.5, + "epoch": 0.21675, + "grad_norm": 0.03598388284444809, + "kl": 0.0767478421330452, + "learning_rate": 4.795252132751008e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 867 + }, + { + "completion_length": 121.125, + "epoch": 0.217, + "grad_norm": 0.9107113480567932, + "kl": 0.0775720402598381, + "learning_rate": 4.794386564209953e-06, + "loss": 0.0031, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 868 + }, + { + "completion_length": 168.875, + "epoch": 0.21725, + "grad_norm": 0.6234192848205566, + "kl": 0.06976676732301712, + "learning_rate": 4.793519248393721e-06, + "loss": 0.0028, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.875, + "step": 869 + }, + { + "completion_length": 133.75, + "epoch": 0.2175, + "grad_norm": 0.7727608680725098, + "kl": 0.12097954005002975, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0048, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 870 + }, + { + "completion_length": 146.625, + "epoch": 0.21775, + "grad_norm": 0.5868052244186401, + "kl": 0.042058780789375305, + "learning_rate": 4.791779377579051e-06, + "loss": 0.0017, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 871 + }, + { + "completion_length": 203.625, + "epoch": 0.218, + "grad_norm": 0.023968705907464027, + "kl": 0.05295765399932861, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 872 + }, + { + "completion_length": 167.625, + "epoch": 0.21825, + "grad_norm": 0.563790500164032, + "kl": 0.03417491540312767, + "learning_rate": 4.790032525606945e-06, + "loss": 0.0014, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 873 + }, + { + "completion_length": 104.875, + "epoch": 0.2185, + "grad_norm": 0.6778443455696106, + "kl": 0.05654909834265709, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0023, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 874 + }, + { + "completion_length": 156.25, + "epoch": 0.21875, + "grad_norm": 0.693367600440979, + "kl": 0.05336631089448929, + "learning_rate": 4.788278697798619e-06, + "loss": 0.0021, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 875 + }, + { + "completion_length": 187.125, + "epoch": 0.219, + "grad_norm": 0.6634543538093567, + "kl": 0.06201925501227379, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0025, + "reward": 1.1937499046325684, + "reward_std": 0.33320683240890503, + "rewards/_accuracy_reward": 0.19374999403953552, + "rewards/_format_reward": 1.0, + "step": 876 + }, + { + "completion_length": 169.125, + "epoch": 0.21925, + "grad_norm": 0.625119149684906, + "kl": 0.06440308690071106, + "learning_rate": 4.786517899496535e-06, + "loss": 0.0026, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 877 + }, + { + "completion_length": 101.875, + "epoch": 0.2195, + "grad_norm": 0.7212375402450562, + "kl": 0.06768248230218887, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0027, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 878 + }, + { + "completion_length": 125.5, + "epoch": 0.21975, + "grad_norm": 0.8191072344779968, + "kl": 0.08255218714475632, + "learning_rate": 4.78475013606439e-06, + "loss": 0.0033, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 879 + }, + { + "completion_length": 165.0, + "epoch": 0.22, + "grad_norm": 0.5947607159614563, + "kl": 0.03656945377588272, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0015, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 880 + }, + { + "completion_length": 84.0, + "epoch": 0.22025, + "grad_norm": 0.7441555857658386, + "kl": 0.1628367006778717, + "learning_rate": 4.7829754128871e-06, + "loss": 0.0065, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 881 + }, + { + "completion_length": 123.5, + "epoch": 0.2205, + "grad_norm": 0.697911262512207, + "kl": 0.08555817604064941, + "learning_rate": 4.782085443082607e-06, + "loss": 0.0034, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 882 + }, + { + "completion_length": 120.125, + "epoch": 0.22075, + "grad_norm": 0.02246098220348358, + "kl": 0.05425877869129181, + "learning_rate": 4.7811937353707776e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 883 + }, + { + "completion_length": 231.875, + "epoch": 0.221, + "grad_norm": 0.4748757779598236, + "kl": 0.04530277103185654, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0018, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 884 + }, + { + "completion_length": 178.625, + "epoch": 0.22125, + "grad_norm": 0.7197502255439758, + "kl": 0.05272772163152695, + "learning_rate": 4.779405108942722e-06, + "loss": 0.0021, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 885 + }, + { + "completion_length": 112.125, + "epoch": 0.2215, + "grad_norm": 0.07369816303253174, + "kl": 0.053687069565057755, + "learning_rate": 4.778508191588613e-06, + "loss": 0.0021, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 886 + }, + { + "completion_length": 111.25, + "epoch": 0.22175, + "grad_norm": 0.04976990818977356, + "kl": 0.053799863904714584, + "learning_rate": 4.7776095390514e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 887 + }, + { + "completion_length": 105.25, + "epoch": 0.222, + "grad_norm": 0.9817723631858826, + "kl": 0.0718049630522728, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0029, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 888 + }, + { + "completion_length": 160.75, + "epoch": 0.22225, + "grad_norm": 0.8418934345245361, + "kl": 0.046997714787721634, + "learning_rate": 4.775807031166428e-06, + "loss": 0.0019, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 889 + }, + { + "completion_length": 164.125, + "epoch": 0.2225, + "grad_norm": 0.637934148311615, + "kl": 0.05987370014190674, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0024, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 890 + }, + { + "completion_length": 205.5, + "epoch": 0.22275, + "grad_norm": 0.4765873849391937, + "kl": 0.052201397716999054, + "learning_rate": 4.773997590778558e-06, + "loss": 0.0021, + "reward": 1.4387500286102295, + "reward_std": 0.7156503200531006, + "rewards/_accuracy_reward": 0.5637500286102295, + "rewards/_format_reward": 0.875, + "step": 891 + }, + { + "completion_length": 173.25, + "epoch": 0.223, + "grad_norm": 0.5394091010093689, + "kl": 0.07298759371042252, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0029, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 892 + }, + { + "completion_length": 156.375, + "epoch": 0.22325, + "grad_norm": 0.7306042909622192, + "kl": 0.06932543963193893, + "learning_rate": 4.77218122339966e-06, + "loss": 0.0028, + "reward": 1.5012500286102295, + "reward_std": 0.4208982586860657, + "rewards/_accuracy_reward": 0.5012500286102295, + "rewards/_format_reward": 1.0, + "step": 893 + }, + { + "completion_length": 138.75, + "epoch": 0.2235, + "grad_norm": 0.021331820636987686, + "kl": 0.037514664232730865, + "learning_rate": 4.771270443816805e-06, + "loss": 0.0015, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 894 + }, + { + "completion_length": 103.25, + "epoch": 0.22375, + "grad_norm": 0.024962345138192177, + "kl": 0.04039287567138672, + "learning_rate": 4.770357934562704e-06, + "loss": 0.0016, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 895 + }, + { + "completion_length": 129.5, + "epoch": 0.224, + "grad_norm": 0.022136209532618523, + "kl": 0.05364343896508217, + "learning_rate": 4.769443696332272e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 896 + }, + { + "completion_length": 123.875, + "epoch": 0.22425, + "grad_norm": 1.0361751317977905, + "kl": 0.08603714406490326, + "learning_rate": 4.7685277298217425e-06, + "loss": 0.0034, + "reward": 1.3624999523162842, + "reward_std": 0.404218852519989, + "rewards/_accuracy_reward": 0.36249998211860657, + "rewards/_format_reward": 1.0, + "step": 897 + }, + { + "completion_length": 114.5, + "epoch": 0.2245, + "grad_norm": 0.8131916522979736, + "kl": 0.07286559790372849, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0029, + "reward": 1.337499976158142, + "reward_std": 0.4181165397167206, + "rewards/_accuracy_reward": 0.3374999761581421, + "rewards/_format_reward": 1.0, + "step": 898 + }, + { + "completion_length": 197.25, + "epoch": 0.22475, + "grad_norm": 0.5685933232307434, + "kl": 0.053996216505765915, + "learning_rate": 4.766690614751897e-06, + "loss": 0.0022, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 899 + }, + { + "completion_length": 80.25, + "epoch": 0.225, + "grad_norm": 0.023312676697969437, + "kl": 0.046684399247169495, + "learning_rate": 4.765769467591626e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 900 + }, + { + "completion_length": 173.625, + "epoch": 0.22525, + "grad_norm": 0.6596946120262146, + "kl": 0.04578675329685211, + "learning_rate": 4.76484659494934e-06, + "loss": 0.0018, + "reward": 1.6637499332427979, + "reward_std": 0.4691310524940491, + "rewards/_accuracy_reward": 0.6637499928474426, + "rewards/_format_reward": 1.0, + "step": 901 + }, + { + "completion_length": 91.875, + "epoch": 0.2255, + "grad_norm": 0.9707310795783997, + "kl": 0.03129902854561806, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0013, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 902 + }, + { + "completion_length": 134.25, + "epoch": 0.22575, + "grad_norm": 0.7488545775413513, + "kl": 0.07043536752462387, + "learning_rate": 4.762995676031275e-06, + "loss": 0.0028, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 903 + }, + { + "completion_length": 179.125, + "epoch": 0.226, + "grad_norm": 0.5836617350578308, + "kl": 0.047344304621219635, + "learning_rate": 4.762067631165049e-06, + "loss": 0.0019, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 904 + }, + { + "completion_length": 177.875, + "epoch": 0.22625, + "grad_norm": 0.5657011270523071, + "kl": 0.03409140184521675, + "learning_rate": 4.761137863635921e-06, + "loss": 0.0014, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 905 + }, + { + "completion_length": 183.125, + "epoch": 0.2265, + "grad_norm": 0.481889009475708, + "kl": 0.048406727612018585, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 906 + }, + { + "completion_length": 135.75, + "epoch": 0.22675, + "grad_norm": 0.6987292170524597, + "kl": 0.042507898062467575, + "learning_rate": 4.759273163422496e-06, + "loss": 0.0017, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 907 + }, + { + "completion_length": 215.875, + "epoch": 0.227, + "grad_norm": 0.5568097829818726, + "kl": 0.04911498725414276, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.002, + "reward": 1.131250023841858, + "reward_std": 0.9902876615524292, + "rewards/_accuracy_reward": 0.5062500238418579, + "rewards/_format_reward": 0.625, + "step": 908 + }, + { + "completion_length": 178.125, + "epoch": 0.22725, + "grad_norm": 0.6918895244598389, + "kl": 0.04337242990732193, + "learning_rate": 4.757401581071203e-06, + "loss": 0.0017, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 909 + }, + { + "completion_length": 170.875, + "epoch": 0.2275, + "grad_norm": 0.04047883674502373, + "kl": 0.06391174346208572, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 910 + }, + { + "completion_length": 190.0, + "epoch": 0.22775, + "grad_norm": 0.6083565950393677, + "kl": 0.04682445526123047, + "learning_rate": 4.755523122283206e-06, + "loss": 0.0019, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 911 + }, + { + "completion_length": 135.5, + "epoch": 0.228, + "grad_norm": 0.7495693564414978, + "kl": 0.0452733151614666, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0018, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 912 + }, + { + "completion_length": 143.0, + "epoch": 0.22825, + "grad_norm": 0.9222220778465271, + "kl": 0.038283564150333405, + "learning_rate": 4.753637792780614e-06, + "loss": 0.0015, + "reward": 1.1624999046325684, + "reward_std": 0.338853120803833, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 0.875, + "step": 913 + }, + { + "completion_length": 123.75, + "epoch": 0.2285, + "grad_norm": 0.6918537616729736, + "kl": 0.04517769068479538, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 914 + }, + { + "completion_length": 157.5, + "epoch": 0.22875, + "grad_norm": 0.9993019700050354, + "kl": 0.061792753636837006, + "learning_rate": 4.7517455983064694e-06, + "loss": 0.0025, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 915 + }, + { + "completion_length": 137.75, + "epoch": 0.229, + "grad_norm": 0.8047826290130615, + "kl": 0.05265399068593979, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0021, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 916 + }, + { + "completion_length": 70.5, + "epoch": 0.22925, + "grad_norm": 1.218523383140564, + "kl": 0.3203689157962799, + "learning_rate": 4.749846544624725e-06, + "loss": 0.0128, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 917 + }, + { + "completion_length": 166.5, + "epoch": 0.2295, + "grad_norm": 0.7494506239891052, + "kl": 0.047605931758880615, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0019, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 918 + }, + { + "completion_length": 149.875, + "epoch": 0.22975, + "grad_norm": 0.5708422064781189, + "kl": 0.06305453181266785, + "learning_rate": 4.747940637520226e-06, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 919 + }, + { + "completion_length": 146.375, + "epoch": 0.23, + "grad_norm": 0.6123313307762146, + "kl": 0.05034901574254036, + "learning_rate": 4.746985115747918e-06, + "loss": 0.002, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 920 + }, + { + "completion_length": 168.75, + "epoch": 0.23025, + "grad_norm": 0.02322268672287464, + "kl": 0.04557321220636368, + "learning_rate": 4.746027882798697e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 921 + }, + { + "completion_length": 197.0, + "epoch": 0.2305, + "grad_norm": 0.6486432552337646, + "kl": 0.06248953938484192, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0025, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 922 + }, + { + "completion_length": 163.75, + "epoch": 0.23075, + "grad_norm": 0.6944877505302429, + "kl": 0.051626596599817276, + "learning_rate": 4.744108286286721e-06, + "loss": 0.0021, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 923 + }, + { + "completion_length": 176.0, + "epoch": 0.231, + "grad_norm": 0.603113055229187, + "kl": 0.07564710080623627, + "learning_rate": 4.743145924185821e-06, + "loss": 0.003, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 924 + }, + { + "completion_length": 177.0, + "epoch": 0.23125, + "grad_norm": 0.5147649645805359, + "kl": 0.06837765127420425, + "learning_rate": 4.742181853831721e-06, + "loss": 0.0027, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 925 + }, + { + "completion_length": 155.75, + "epoch": 0.2315, + "grad_norm": 0.6233566403388977, + "kl": 0.06354863196611404, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0025, + "reward": 1.412500023841858, + "reward_std": 0.36912059783935547, + "rewards/_accuracy_reward": 0.4124999940395355, + "rewards/_format_reward": 1.0, + "step": 926 + }, + { + "completion_length": 184.125, + "epoch": 0.23175, + "grad_norm": 0.5731986165046692, + "kl": 0.050587963312864304, + "learning_rate": 4.740248591301945e-06, + "loss": 0.002, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 927 + }, + { + "completion_length": 100.5, + "epoch": 0.232, + "grad_norm": 1.4272077083587646, + "kl": 0.08796297013759613, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0035, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 928 + }, + { + "completion_length": 114.625, + "epoch": 0.23225, + "grad_norm": 0.027059296146035194, + "kl": 0.060833338648080826, + "learning_rate": 4.738308504586445e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 929 + }, + { + "completion_length": 90.875, + "epoch": 0.2325, + "grad_norm": 0.9342624545097351, + "kl": 0.058008529245853424, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0023, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.668749988079071, + "rewards/_format_reward": 1.0, + "step": 930 + }, + { + "completion_length": 138.625, + "epoch": 0.23275, + "grad_norm": 0.030422937124967575, + "kl": 0.04037817567586899, + "learning_rate": 4.736361599595063e-06, + "loss": 0.0016, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 931 + }, + { + "completion_length": 101.25, + "epoch": 0.233, + "grad_norm": 0.7583170533180237, + "kl": 0.08360082656145096, + "learning_rate": 4.735385592098421e-06, + "loss": 0.0033, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 932 + }, + { + "completion_length": 158.625, + "epoch": 0.23325, + "grad_norm": 0.6318457722663879, + "kl": 0.05788834020495415, + "learning_rate": 4.734407882258408e-06, + "loss": 0.0023, + "reward": 1.625, + "reward_std": 0.40089187026023865, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 1.0, + "step": 933 + }, + { + "completion_length": 176.5, + "epoch": 0.2335, + "grad_norm": 0.6953318119049072, + "kl": 0.0506259948015213, + "learning_rate": 4.733428470819595e-06, + "loss": 0.002, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 934 + }, + { + "completion_length": 118.5, + "epoch": 0.23375, + "grad_norm": 0.021268269047141075, + "kl": 0.03520062938332558, + "learning_rate": 4.732447358527843e-06, + "loss": 0.0014, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 935 + }, + { + "completion_length": 102.875, + "epoch": 0.234, + "grad_norm": 0.8448748588562012, + "kl": 0.07049023360013962, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0028, + "reward": 1.6375000476837158, + "reward_std": 0.7224709987640381, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 0.875, + "step": 936 + }, + { + "completion_length": 159.875, + "epoch": 0.23425, + "grad_norm": 0.8189094066619873, + "kl": 0.0628812164068222, + "learning_rate": 4.730480034375462e-06, + "loss": 0.0025, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 937 + }, + { + "completion_length": 165.875, + "epoch": 0.2345, + "grad_norm": 0.026304002851247787, + "kl": 0.06353601813316345, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 938 + }, + { + "completion_length": 191.375, + "epoch": 0.23475, + "grad_norm": 0.6404834389686584, + "kl": 0.040367912501096725, + "learning_rate": 4.7285059157940765e-06, + "loss": 0.0016, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 939 + }, + { + "completion_length": 180.0, + "epoch": 0.235, + "grad_norm": 0.660751461982727, + "kl": 0.11275404691696167, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0045, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 940 + }, + { + "completion_length": 176.875, + "epoch": 0.23525, + "grad_norm": 0.03521808236837387, + "kl": 0.05358295515179634, + "learning_rate": 4.726525008797194e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 941 + }, + { + "completion_length": 91.25, + "epoch": 0.2355, + "grad_norm": 0.06520809978246689, + "kl": 0.06617758423089981, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 942 + }, + { + "completion_length": 193.625, + "epoch": 0.23575, + "grad_norm": 0.029271895065903664, + "kl": 0.060000792145729065, + "learning_rate": 4.724537319419e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 943 + }, + { + "completion_length": 169.75, + "epoch": 0.236, + "grad_norm": 0.7201210260391235, + "kl": 0.06569670885801315, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0026, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 944 + }, + { + "completion_length": 177.875, + "epoch": 0.23625, + "grad_norm": 0.5506649613380432, + "kl": 0.04273473471403122, + "learning_rate": 4.7225428537143414e-06, + "loss": 0.0017, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.668749988079071, + "rewards/_format_reward": 1.0, + "step": 945 + }, + { + "completion_length": 98.125, + "epoch": 0.2365, + "grad_norm": 0.021767426282167435, + "kl": 0.08275524526834488, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 946 + }, + { + "completion_length": 93.875, + "epoch": 0.23675, + "grad_norm": 0.7869265675544739, + "kl": 0.06405540555715561, + "learning_rate": 4.720541617758707e-06, + "loss": 0.0026, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 947 + }, + { + "completion_length": 172.125, + "epoch": 0.237, + "grad_norm": 0.5938262939453125, + "kl": 0.06068947911262512, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0024, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 948 + }, + { + "completion_length": 112.25, + "epoch": 0.23725, + "grad_norm": 1.1759775876998901, + "kl": 0.2916286885738373, + "learning_rate": 4.718533617648209e-06, + "loss": 0.0117, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 949 + }, + { + "completion_length": 86.75, + "epoch": 0.2375, + "grad_norm": 0.8917730450630188, + "kl": 0.08274582028388977, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0033, + "reward": 1.693750023841858, + "reward_std": 0.4271479547023773, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 950 + }, + { + "completion_length": 155.625, + "epoch": 0.23775, + "grad_norm": 0.02533833496272564, + "kl": 0.04348806291818619, + "learning_rate": 4.716518859499563e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 951 + }, + { + "completion_length": 100.25, + "epoch": 0.238, + "grad_norm": 0.669034481048584, + "kl": 0.039708979427814484, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0016, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 952 + }, + { + "completion_length": 93.25, + "epoch": 0.23825, + "grad_norm": 1.0357922315597534, + "kl": 0.060257647186517715, + "learning_rate": 4.714497349450071e-06, + "loss": 0.0024, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 953 + }, + { + "completion_length": 111.75, + "epoch": 0.2385, + "grad_norm": 0.7821860909461975, + "kl": 0.03877865895628929, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0016, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 954 + }, + { + "completion_length": 155.625, + "epoch": 0.23875, + "grad_norm": 0.04612157121300697, + "kl": 0.04425455257296562, + "learning_rate": 4.712469093657605e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 955 + }, + { + "completion_length": 149.75, + "epoch": 0.239, + "grad_norm": 0.7191103100776672, + "kl": 0.06059260666370392, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0024, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 956 + }, + { + "completion_length": 169.0, + "epoch": 0.23925, + "grad_norm": 0.688480019569397, + "kl": 0.05014285817742348, + "learning_rate": 4.710434098300584e-06, + "loss": 0.002, + "reward": 1.5187499523162842, + "reward_std": 0.7323824167251587, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 0.875, + "step": 957 + }, + { + "completion_length": 86.0, + "epoch": 0.2395, + "grad_norm": 0.7771919369697571, + "kl": 0.06990315020084381, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0028, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 958 + }, + { + "completion_length": 126.75, + "epoch": 0.23975, + "grad_norm": 0.815279483795166, + "kl": 0.07264941185712814, + "learning_rate": 4.7083923695779546e-06, + "loss": 0.0029, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 959 + }, + { + "completion_length": 171.0, + "epoch": 0.24, + "grad_norm": 0.5627329349517822, + "kl": 0.03297321870923042, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0013, + "reward": 1.0625, + "reward_std": 0.5403371453285217, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.875, + "step": 960 + }, + { + "completion_length": 158.25, + "epoch": 0.24025, + "grad_norm": 0.4752623438835144, + "kl": 0.046661727130413055, + "learning_rate": 4.706343913709178e-06, + "loss": 0.0019, + "reward": 1.631250023841858, + "reward_std": 0.7382108569145203, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 961 + }, + { + "completion_length": 150.5, + "epoch": 0.2405, + "grad_norm": 0.06294679641723633, + "kl": 0.06743825972080231, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 962 + }, + { + "completion_length": 148.625, + "epoch": 0.24075, + "grad_norm": 0.8773144483566284, + "kl": 0.05056190490722656, + "learning_rate": 4.704288736934207e-06, + "loss": 0.002, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 963 + }, + { + "completion_length": 144.375, + "epoch": 0.241, + "grad_norm": 0.5290398001670837, + "kl": 0.03290877863764763, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0013, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 964 + }, + { + "completion_length": 194.125, + "epoch": 0.24125, + "grad_norm": 0.6033351421356201, + "kl": 0.05277324095368385, + "learning_rate": 4.702226845513465e-06, + "loss": 0.0021, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 1.0, + "step": 965 + }, + { + "completion_length": 169.5, + "epoch": 0.2415, + "grad_norm": 0.6577865481376648, + "kl": 0.05829022079706192, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0023, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 966 + }, + { + "completion_length": 170.5, + "epoch": 0.24175, + "grad_norm": 0.6581544280052185, + "kl": 0.05274191126227379, + "learning_rate": 4.70015824572783e-06, + "loss": 0.0021, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 967 + }, + { + "completion_length": 155.75, + "epoch": 0.242, + "grad_norm": 0.6504667401313782, + "kl": 0.04294908419251442, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0017, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 968 + }, + { + "completion_length": 113.125, + "epoch": 0.24225, + "grad_norm": 0.02235039882361889, + "kl": 0.05611787736415863, + "learning_rate": 4.6980829438786176e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 969 + }, + { + "completion_length": 194.25, + "epoch": 0.2425, + "grad_norm": 0.6296171545982361, + "kl": 0.05270276963710785, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0021, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 970 + }, + { + "completion_length": 122.75, + "epoch": 0.24275, + "grad_norm": 0.02494359202682972, + "kl": 0.0459245890378952, + "learning_rate": 4.696000946287558e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 971 + }, + { + "completion_length": 139.5, + "epoch": 0.243, + "grad_norm": 0.6154767870903015, + "kl": 0.053218137472867966, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 972 + }, + { + "completion_length": 145.875, + "epoch": 0.24325, + "grad_norm": 0.6306710243225098, + "kl": 0.05243955925107002, + "learning_rate": 4.693912259296773e-06, + "loss": 0.0021, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 973 + }, + { + "completion_length": 181.375, + "epoch": 0.2435, + "grad_norm": 0.5748218297958374, + "kl": 0.07226546108722687, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.0029, + "reward": 0.9124999046325684, + "reward_std": 0.6384971141815186, + "rewards/_accuracy_reward": 0.16249999403953552, + "rewards/_format_reward": 0.75, + "step": 974 + }, + { + "completion_length": 81.0, + "epoch": 0.24375, + "grad_norm": 0.027224192395806313, + "kl": 0.04066862910985947, + "learning_rate": 4.69181688926877e-06, + "loss": 0.0016, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 975 + }, + { + "completion_length": 139.375, + "epoch": 0.244, + "grad_norm": 0.03555990010499954, + "kl": 0.0650315135717392, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 976 + }, + { + "completion_length": 84.125, + "epoch": 0.24425, + "grad_norm": 1.2255222797393799, + "kl": 0.055360615253448486, + "learning_rate": 4.689714842586406e-06, + "loss": 0.0022, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 977 + }, + { + "completion_length": 161.875, + "epoch": 0.2445, + "grad_norm": 0.044598329812288284, + "kl": 0.059485312551259995, + "learning_rate": 4.688661317500045e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 978 + }, + { + "completion_length": 122.125, + "epoch": 0.24475, + "grad_norm": 0.9748655557632446, + "kl": 0.13712793588638306, + "learning_rate": 4.687606125652882e-06, + "loss": 0.0055, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 979 + }, + { + "completion_length": 142.75, + "epoch": 0.245, + "grad_norm": 0.02790018729865551, + "kl": 0.052116554230451584, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 980 + }, + { + "completion_length": 154.0, + "epoch": 0.24525, + "grad_norm": 0.6151390075683594, + "kl": 0.06243494153022766, + "learning_rate": 4.685490744891713e-06, + "loss": 0.0025, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 981 + }, + { + "completion_length": 144.875, + "epoch": 0.2455, + "grad_norm": 0.7018941640853882, + "kl": 0.03996715694665909, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0016, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 982 + }, + { + "completion_length": 190.875, + "epoch": 0.24575, + "grad_norm": 0.6229122281074524, + "kl": 0.047085534781217575, + "learning_rate": 4.6833687067467185e-06, + "loss": 0.0019, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 983 + }, + { + "completion_length": 156.875, + "epoch": 0.246, + "grad_norm": 0.9269128441810608, + "kl": 0.08278842270374298, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0033, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 984 + }, + { + "completion_length": 130.75, + "epoch": 0.24625, + "grad_norm": 0.8343520760536194, + "kl": 0.030732639133930206, + "learning_rate": 4.681240017681994e-06, + "loss": 0.0012, + "reward": 0.9249999523162842, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 0.875, + "step": 985 + }, + { + "completion_length": 183.75, + "epoch": 0.2465, + "grad_norm": 0.4889462888240814, + "kl": 0.02942308969795704, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0012, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 986 + }, + { + "completion_length": 168.5, + "epoch": 0.24675, + "grad_norm": 0.733590841293335, + "kl": 0.044175948947668076, + "learning_rate": 4.679104684181893e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 987 + }, + { + "completion_length": 132.125, + "epoch": 0.247, + "grad_norm": 0.7705006003379822, + "kl": 0.04930579662322998, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.002, + "reward": 1.6637500524520874, + "reward_std": 0.4691310524940491, + "rewards/_accuracy_reward": 0.6637500524520874, + "rewards/_format_reward": 1.0, + "step": 988 + }, + { + "completion_length": 156.5, + "epoch": 0.24725, + "grad_norm": 0.03196730837225914, + "kl": 0.046509016305208206, + "learning_rate": 4.676962712751015e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 989 + }, + { + "completion_length": 136.5, + "epoch": 0.2475, + "grad_norm": 0.7628892660140991, + "kl": 0.10023954510688782, + "learning_rate": 4.675889239849749e-06, + "loss": 0.004, + "reward": 1.306249976158142, + "reward_std": 0.6784633994102478, + "rewards/_accuracy_reward": 0.4312500059604645, + "rewards/_format_reward": 0.875, + "step": 990 + }, + { + "completion_length": 210.625, + "epoch": 0.24775, + "grad_norm": 0.5449919700622559, + "kl": 0.04874037951231003, + "learning_rate": 4.674814109914174e-06, + "loss": 0.0019, + "reward": 1.3125, + "reward_std": 0.873723566532135, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.75, + "step": 991 + }, + { + "completion_length": 102.125, + "epoch": 0.248, + "grad_norm": 0.07450534403324127, + "kl": 0.05309867858886719, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 992 + }, + { + "completion_length": 104.125, + "epoch": 0.24825, + "grad_norm": 0.7853342294692993, + "kl": 0.04614270478487015, + "learning_rate": 4.67265888221639e-06, + "loss": 0.0018, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 993 + }, + { + "completion_length": 125.375, + "epoch": 0.2485, + "grad_norm": 0.8209336400032043, + "kl": 0.040556248277425766, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0016, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 994 + }, + { + "completion_length": 157.125, + "epoch": 0.24875, + "grad_norm": 0.7643418908119202, + "kl": 0.07217823714017868, + "learning_rate": 4.670497036222856e-06, + "loss": 0.0029, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 1.0, + "step": 995 + }, + { + "completion_length": 119.75, + "epoch": 0.249, + "grad_norm": 0.8099098801612854, + "kl": 0.053473103791475296, + "learning_rate": 4.669413633422322e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 996 + }, + { + "completion_length": 155.75, + "epoch": 0.24925, + "grad_norm": 0.620407223701477, + "kl": 0.05117916315793991, + "learning_rate": 4.668328578518933e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 997 + }, + { + "completion_length": 183.625, + "epoch": 0.2495, + "grad_norm": 0.8129308223724365, + "kl": 0.07624640315771103, + "learning_rate": 4.667241872339007e-06, + "loss": 0.003, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.75, + "step": 998 + }, + { + "completion_length": 153.625, + "epoch": 0.24975, + "grad_norm": 0.6230663061141968, + "kl": 0.04454692453145981, + "learning_rate": 4.666153515710118e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 999 + }, + { + "completion_length": 121.125, + "epoch": 0.25, + "grad_norm": 0.8729314804077148, + "kl": 0.040495917201042175, + "learning_rate": 4.665063509461098e-06, + "loss": 0.0016, + "reward": 1.6875, + "reward_std": 0.4381372928619385, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 1000 + }, + { + "completion_length": 118.375, + "epoch": 0.25025, + "grad_norm": 0.7109651565551758, + "kl": 0.1086815893650055, + "learning_rate": 4.66397185442203e-06, + "loss": 0.0043, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1001 + }, + { + "completion_length": 176.75, + "epoch": 0.2505, + "grad_norm": 0.536847710609436, + "kl": 0.0531662292778492, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0021, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 1002 + }, + { + "completion_length": 103.5, + "epoch": 0.25075, + "grad_norm": 0.034843314439058304, + "kl": 0.048501890152692795, + "learning_rate": 4.6617836013003885e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1003 + }, + { + "completion_length": 198.25, + "epoch": 0.251, + "grad_norm": 0.6616882085800171, + "kl": 0.07180707901716232, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0029, + "reward": 1.2625000476837158, + "reward_std": 0.8745407462120056, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.625, + "step": 1004 + }, + { + "completion_length": 110.0, + "epoch": 0.25125, + "grad_norm": 0.9252467155456543, + "kl": 0.08372751623392105, + "learning_rate": 4.65958876301099e-06, + "loss": 0.0033, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1005 + }, + { + "completion_length": 162.875, + "epoch": 0.2515, + "grad_norm": 0.48977896571159363, + "kl": 0.05411114916205406, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0022, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 1006 + }, + { + "completion_length": 64.125, + "epoch": 0.25175, + "grad_norm": 1.0273231267929077, + "kl": 0.083249032497406, + "learning_rate": 4.6573873462396935e-06, + "loss": 0.0033, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1007 + }, + { + "completion_length": 165.25, + "epoch": 0.252, + "grad_norm": 0.5020899176597595, + "kl": 0.05609561502933502, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0022, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1008 + }, + { + "completion_length": 142.5, + "epoch": 0.25225, + "grad_norm": 0.7043915390968323, + "kl": 0.05257268622517586, + "learning_rate": 4.655179357692396e-06, + "loss": 0.0021, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1009 + }, + { + "completion_length": 174.0, + "epoch": 0.2525, + "grad_norm": 0.705171525478363, + "kl": 0.06622593849897385, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0026, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1010 + }, + { + "completion_length": 83.75, + "epoch": 0.25275, + "grad_norm": 0.04914606735110283, + "kl": 0.05078301206231117, + "learning_rate": 4.652964804095015e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1011 + }, + { + "completion_length": 101.375, + "epoch": 0.253, + "grad_norm": 1.0606887340545654, + "kl": 0.04142900928854942, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0017, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1012 + }, + { + "completion_length": 128.0, + "epoch": 0.25325, + "grad_norm": 0.742917001247406, + "kl": 0.062295470386743546, + "learning_rate": 4.650743692193462e-06, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.75, + "step": 1013 + }, + { + "completion_length": 113.0, + "epoch": 0.2535, + "grad_norm": 0.7006327509880066, + "kl": 0.04041333496570587, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0016, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1014 + }, + { + "completion_length": 174.625, + "epoch": 0.25375, + "grad_norm": 0.7579224109649658, + "kl": 0.07045772671699524, + "learning_rate": 4.648516028753632e-06, + "loss": 0.0028, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 1015 + }, + { + "completion_length": 117.375, + "epoch": 0.254, + "grad_norm": 0.7735872864723206, + "kl": 0.051190085709095, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1016 + }, + { + "completion_length": 163.375, + "epoch": 0.25425, + "grad_norm": 0.7195467352867126, + "kl": 0.06525428593158722, + "learning_rate": 4.646281820561372e-06, + "loss": 0.0026, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1017 + }, + { + "completion_length": 147.125, + "epoch": 0.2545, + "grad_norm": 0.021355951204895973, + "kl": 0.04439732804894447, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1018 + }, + { + "completion_length": 170.0, + "epoch": 0.25475, + "grad_norm": 0.6866489052772522, + "kl": 0.056952353566884995, + "learning_rate": 4.644041074422469e-06, + "loss": 0.0023, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1019 + }, + { + "completion_length": 166.125, + "epoch": 0.255, + "grad_norm": 0.6103510856628418, + "kl": 0.05860072746872902, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0023, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1020 + }, + { + "completion_length": 161.5, + "epoch": 0.25525, + "grad_norm": 0.6660823822021484, + "kl": 0.06608124077320099, + "learning_rate": 4.641793797162625e-06, + "loss": 0.0026, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1021 + }, + { + "completion_length": 92.0, + "epoch": 0.2555, + "grad_norm": 0.07947294414043427, + "kl": 0.09260207414627075, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0037, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1022 + }, + { + "completion_length": 143.75, + "epoch": 0.25575, + "grad_norm": 0.023698054254055023, + "kl": 0.04806230962276459, + "learning_rate": 4.6395399956274334e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1023 + }, + { + "completion_length": 215.5, + "epoch": 0.256, + "grad_norm": 0.5254735350608826, + "kl": 0.037495993077754974, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0015, + "reward": 1.193750023841858, + "reward_std": 0.8304204940795898, + "rewards/_accuracy_reward": 0.4437499940395355, + "rewards/_format_reward": 0.75, + "step": 1024 + }, + { + "completion_length": 138.0, + "epoch": 0.25625, + "grad_norm": 0.02838887646794319, + "kl": 0.06760545819997787, + "learning_rate": 4.637279676682367e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1025 + }, + { + "completion_length": 126.375, + "epoch": 0.2565, + "grad_norm": 0.024629445746541023, + "kl": 0.06675737351179123, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1026 + }, + { + "completion_length": 143.0, + "epoch": 0.25675, + "grad_norm": 0.600119411945343, + "kl": 0.06903208047151566, + "learning_rate": 4.635012847212749e-06, + "loss": 0.0028, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1027 + }, + { + "completion_length": 134.625, + "epoch": 0.257, + "grad_norm": 0.7650855183601379, + "kl": 0.059212468564510345, + "learning_rate": 4.633876993188319e-06, + "loss": 0.0024, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 1028 + }, + { + "completion_length": 180.375, + "epoch": 0.25725, + "grad_norm": 0.5085853338241577, + "kl": 0.04607116058468819, + "learning_rate": 4.632739514123733e-06, + "loss": 0.0018, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.875, + "step": 1029 + }, + { + "completion_length": 154.25, + "epoch": 0.2575, + "grad_norm": 0.6598264575004578, + "kl": 0.05613408610224724, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0022, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 1030 + }, + { + "completion_length": 135.375, + "epoch": 0.25775, + "grad_norm": 0.020788883790373802, + "kl": 0.06492722034454346, + "learning_rate": 4.6304596843402885e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1031 + }, + { + "completion_length": 165.375, + "epoch": 0.258, + "grad_norm": 0.6187155842781067, + "kl": 0.045201320201158524, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0018, + "reward": 1.53125, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.53125, + "rewards/_format_reward": 1.0, + "step": 1032 + }, + { + "completion_length": 195.125, + "epoch": 0.25825, + "grad_norm": 0.5255759358406067, + "kl": 0.05828214809298515, + "learning_rate": 4.628173364807171e-06, + "loss": 0.0023, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1033 + }, + { + "completion_length": 150.0, + "epoch": 0.2585, + "grad_norm": 0.6497631072998047, + "kl": 0.07949826866388321, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0032, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1034 + }, + { + "completion_length": 155.25, + "epoch": 0.25875, + "grad_norm": 0.8581832051277161, + "kl": 0.0458100289106369, + "learning_rate": 4.625880562488908e-06, + "loss": 0.0018, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1035 + }, + { + "completion_length": 181.75, + "epoch": 0.259, + "grad_norm": 0.6497436761856079, + "kl": 0.04667607694864273, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0019, + "reward": 1.0625, + "reward_std": 0.5403372049331665, + "rewards/_accuracy_reward": 0.1875, + "rewards/_format_reward": 0.875, + "step": 1036 + }, + { + "completion_length": 161.125, + "epoch": 0.25925, + "grad_norm": 0.6015608906745911, + "kl": 0.047349605709314346, + "learning_rate": 4.62358128436977e-06, + "loss": 0.0019, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1037 + }, + { + "completion_length": 183.75, + "epoch": 0.2595, + "grad_norm": 0.6029166579246521, + "kl": 0.04961675405502319, + "learning_rate": 4.622429219072854e-06, + "loss": 0.002, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 1038 + }, + { + "completion_length": 176.875, + "epoch": 0.25975, + "grad_norm": 0.5468695759773254, + "kl": 0.049566950649023056, + "learning_rate": 4.6212755374537596e-06, + "loss": 0.002, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1039 + }, + { + "completion_length": 90.125, + "epoch": 0.26, + "grad_norm": 0.7798323035240173, + "kl": 0.0876527652144432, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0035, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1040 + }, + { + "completion_length": 199.375, + "epoch": 0.26025, + "grad_norm": 0.5108053088188171, + "kl": 0.056546930223703384, + "learning_rate": 4.61896332876458e-06, + "loss": 0.0023, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1041 + }, + { + "completion_length": 159.125, + "epoch": 0.2605, + "grad_norm": 0.6133120059967041, + "kl": 0.06025753542780876, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0024, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1042 + }, + { + "completion_length": 156.875, + "epoch": 0.26075, + "grad_norm": 0.6843867897987366, + "kl": 0.04468563199043274, + "learning_rate": 4.616644665345621e-06, + "loss": 0.0018, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1043 + }, + { + "completion_length": 193.875, + "epoch": 0.261, + "grad_norm": 0.034902796149253845, + "kl": 0.05293627455830574, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1044 + }, + { + "completion_length": 221.25, + "epoch": 0.26125, + "grad_norm": 0.5196229219436646, + "kl": 0.044894713908433914, + "learning_rate": 4.614319554259934e-06, + "loss": 0.0018, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1045 + }, + { + "completion_length": 107.5, + "epoch": 0.2615, + "grad_norm": 0.021327383816242218, + "kl": 0.04686171934008598, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1046 + }, + { + "completion_length": 169.375, + "epoch": 0.26175, + "grad_norm": 0.029194172471761703, + "kl": 0.06922190636396408, + "learning_rate": 4.611988002590209e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1047 + }, + { + "completion_length": 186.75, + "epoch": 0.262, + "grad_norm": 0.5589852333068848, + "kl": 0.05440639704465866, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0022, + "reward": 0.90625, + "reward_std": 0.6483150720596313, + "rewards/_accuracy_reward": 0.15625, + "rewards/_format_reward": 0.75, + "step": 1048 + }, + { + "completion_length": 181.625, + "epoch": 0.26225, + "grad_norm": 0.7409690022468567, + "kl": 0.0589996799826622, + "learning_rate": 4.609650017438757e-06, + "loss": 0.0024, + "reward": 1.3762500286102295, + "reward_std": 0.915547251701355, + "rewards/_accuracy_reward": 0.6262500286102295, + "rewards/_format_reward": 0.75, + "step": 1049 + }, + { + "completion_length": 197.375, + "epoch": 0.2625, + "grad_norm": 0.7713769674301147, + "kl": 0.05258062854409218, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0021, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1050 + }, + { + "completion_length": 100.0, + "epoch": 0.26275, + "grad_norm": 0.7785927653312683, + "kl": 0.04665987938642502, + "learning_rate": 4.6073056059274865e-06, + "loss": 0.0019, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1051 + }, + { + "completion_length": 200.125, + "epoch": 0.263, + "grad_norm": 0.6331179738044739, + "kl": 0.052123308181762695, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0021, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1052 + }, + { + "completion_length": 109.75, + "epoch": 0.26325, + "grad_norm": 0.8628464937210083, + "kl": 0.05367436632514, + "learning_rate": 4.604954775197882e-06, + "loss": 0.0021, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7574999928474426, + "rewards/_format_reward": 1.0, + "step": 1053 + }, + { + "completion_length": 184.75, + "epoch": 0.2635, + "grad_norm": 0.5951057076454163, + "kl": 0.050312891602516174, + "learning_rate": 4.603776954863266e-06, + "loss": 0.002, + "reward": 1.1875, + "reward_std": 0.9519716501235962, + "rewards/_accuracy_reward": 0.5625, + "rewards/_format_reward": 0.625, + "step": 1054 + }, + { + "completion_length": 157.5, + "epoch": 0.26375, + "grad_norm": 0.8466554880142212, + "kl": 0.04847400635480881, + "learning_rate": 4.602597532410982e-06, + "loss": 0.0019, + "reward": 1.5499999523162842, + "reward_std": 0.4855042099952698, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 1055 + }, + { + "completion_length": 175.125, + "epoch": 0.264, + "grad_norm": 0.5892555713653564, + "kl": 0.05255506560206413, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0021, + "reward": 1.6887500286102295, + "reward_std": 0.43590423464775085, + "rewards/_accuracy_reward": 0.6887500286102295, + "rewards/_format_reward": 1.0, + "step": 1056 + }, + { + "completion_length": 133.125, + "epoch": 0.26425, + "grad_norm": 0.026735153049230576, + "kl": 0.04194887727499008, + "learning_rate": 4.600233884747355e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1057 + }, + { + "completion_length": 113.5, + "epoch": 0.2645, + "grad_norm": 0.5905328989028931, + "kl": 0.0982663631439209, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0039, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1058 + }, + { + "completion_length": 120.75, + "epoch": 0.26475, + "grad_norm": 1.0292084217071533, + "kl": 0.0494346097111702, + "learning_rate": 4.5978638394070835e-06, + "loss": 0.002, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 1059 + }, + { + "completion_length": 183.25, + "epoch": 0.265, + "grad_norm": 0.6092599630355835, + "kl": 0.060696642845869064, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0024, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1060 + }, + { + "completion_length": 155.75, + "epoch": 0.26525, + "grad_norm": 0.7234264612197876, + "kl": 0.05358058586716652, + "learning_rate": 4.595487403609736e-06, + "loss": 0.0021, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1061 + }, + { + "completion_length": 180.75, + "epoch": 0.2655, + "grad_norm": 0.5621580481529236, + "kl": 0.06479740887880325, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0026, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1062 + }, + { + "completion_length": 107.5, + "epoch": 0.26575, + "grad_norm": 0.46496903896331787, + "kl": 0.037450678646564484, + "learning_rate": 4.593104584594348e-06, + "loss": 0.0015, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1063 + }, + { + "completion_length": 114.25, + "epoch": 0.266, + "grad_norm": 0.7852677702903748, + "kl": 0.0679827630519867, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0027, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1064 + }, + { + "completion_length": 181.75, + "epoch": 0.26625, + "grad_norm": 0.5932605862617493, + "kl": 0.057104308158159256, + "learning_rate": 4.590715389619399e-06, + "loss": 0.0023, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1065 + }, + { + "completion_length": 123.75, + "epoch": 0.2665, + "grad_norm": 0.6589471697807312, + "kl": 0.04012312740087509, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0016, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1066 + }, + { + "completion_length": 68.375, + "epoch": 0.26675, + "grad_norm": 0.8799357414245605, + "kl": 0.0480550192296505, + "learning_rate": 4.588319825962793e-06, + "loss": 0.0019, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1067 + }, + { + "completion_length": 146.625, + "epoch": 0.267, + "grad_norm": 0.6732004284858704, + "kl": 0.045358166098594666, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0018, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.875, + "step": 1068 + }, + { + "completion_length": 183.875, + "epoch": 0.26725, + "grad_norm": 0.040793102234601974, + "kl": 0.07178792357444763, + "learning_rate": 4.58591790092183e-06, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1069 + }, + { + "completion_length": 132.5, + "epoch": 0.2675, + "grad_norm": 0.04739204794168472, + "kl": 0.06881429255008698, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1070 + }, + { + "completion_length": 156.75, + "epoch": 0.26775, + "grad_norm": 0.8715057969093323, + "kl": 0.08568653464317322, + "learning_rate": 4.583509621813192e-06, + "loss": 0.0034, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1071 + }, + { + "completion_length": 141.0, + "epoch": 0.268, + "grad_norm": 0.0545993335545063, + "kl": 0.07613251358270645, + "learning_rate": 4.582303101775249e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1072 + }, + { + "completion_length": 154.25, + "epoch": 0.26825, + "grad_norm": 0.7691239714622498, + "kl": 0.07986550033092499, + "learning_rate": 4.581094995972912e-06, + "loss": 0.0032, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1073 + }, + { + "completion_length": 166.75, + "epoch": 0.2685, + "grad_norm": 0.5310074687004089, + "kl": 0.05323049798607826, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1074 + }, + { + "completion_length": 172.625, + "epoch": 0.26875, + "grad_norm": 0.7366254329681396, + "kl": 0.060853827744722366, + "learning_rate": 4.578674030756364e-06, + "loss": 0.0024, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1075 + }, + { + "completion_length": 157.625, + "epoch": 0.269, + "grad_norm": 0.5994381904602051, + "kl": 0.05490027368068695, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0022, + "reward": 1.0749999284744263, + "reward_std": 0.0707106813788414, + "rewards/_accuracy_reward": 0.07500000298023224, + "rewards/_format_reward": 1.0, + "step": 1076 + }, + { + "completion_length": 160.375, + "epoch": 0.26925, + "grad_norm": 0.023495573550462723, + "kl": 0.04241487383842468, + "learning_rate": 4.576246733538223e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1077 + }, + { + "completion_length": 182.625, + "epoch": 0.2695, + "grad_norm": 0.6254801750183105, + "kl": 0.055535938590765, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0022, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 1078 + }, + { + "completion_length": 116.125, + "epoch": 0.26975, + "grad_norm": 0.04201361909508705, + "kl": 0.07124117761850357, + "learning_rate": 4.5738131117124605e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1079 + }, + { + "completion_length": 122.875, + "epoch": 0.27, + "grad_norm": 0.8643020391464233, + "kl": 0.06395326554775238, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0026, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 1080 + }, + { + "completion_length": 146.375, + "epoch": 0.27025, + "grad_norm": 0.019885435700416565, + "kl": 0.038068097084760666, + "learning_rate": 4.571373172692309e-06, + "loss": 0.0015, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1081 + }, + { + "completion_length": 123.625, + "epoch": 0.2705, + "grad_norm": 1.255374550819397, + "kl": 0.05284246429800987, + "learning_rate": 4.570150836556236e-06, + "loss": 0.0021, + "reward": 1.1937499046325684, + "reward_std": 0.33320683240890503, + "rewards/_accuracy_reward": 0.19375000894069672, + "rewards/_format_reward": 1.0, + "step": 1082 + }, + { + "completion_length": 162.5, + "epoch": 0.27075, + "grad_norm": 0.6360936760902405, + "kl": 0.08327899128198624, + "learning_rate": 4.568926923910248e-06, + "loss": 0.0033, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1083 + }, + { + "completion_length": 152.75, + "epoch": 0.271, + "grad_norm": 0.7469035387039185, + "kl": 0.048695940524339676, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0019, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1084 + }, + { + "completion_length": 153.625, + "epoch": 0.27125, + "grad_norm": 0.05639031156897545, + "kl": 0.06517668068408966, + "learning_rate": 4.566474372817971e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1085 + }, + { + "completion_length": 117.75, + "epoch": 0.2715, + "grad_norm": 0.9581111669540405, + "kl": 0.05397922918200493, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0022, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1086 + }, + { + "completion_length": 147.625, + "epoch": 0.27175, + "grad_norm": 0.7434791922569275, + "kl": 0.07150553166866302, + "learning_rate": 4.56401552688638e-06, + "loss": 0.0029, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 1.0, + "step": 1087 + }, + { + "completion_length": 138.25, + "epoch": 0.272, + "grad_norm": 0.8051536083221436, + "kl": 0.05290424823760986, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1088 + }, + { + "completion_length": 130.5, + "epoch": 0.27225, + "grad_norm": 0.05256952345371246, + "kl": 0.07315313816070557, + "learning_rate": 4.561550393605541e-06, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1089 + }, + { + "completion_length": 154.75, + "epoch": 0.2725, + "grad_norm": 0.03965034335851669, + "kl": 0.07297085970640182, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1090 + }, + { + "completion_length": 151.375, + "epoch": 0.27275, + "grad_norm": 0.6053784489631653, + "kl": 0.055059581995010376, + "learning_rate": 4.55907898048468e-06, + "loss": 0.0022, + "reward": 1.431249976158142, + "reward_std": 0.47579824924468994, + "rewards/_accuracy_reward": 0.4312499761581421, + "rewards/_format_reward": 1.0, + "step": 1091 + }, + { + "completion_length": 139.125, + "epoch": 0.273, + "grad_norm": 0.8535023331642151, + "kl": 0.037554092705249786, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0015, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1092 + }, + { + "completion_length": 159.375, + "epoch": 0.27325, + "grad_norm": 0.03497467190027237, + "kl": 0.0712779238820076, + "learning_rate": 4.55660129505215e-06, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1093 + }, + { + "completion_length": 112.375, + "epoch": 0.2735, + "grad_norm": 0.7117050290107727, + "kl": 0.05563116446137428, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0022, + "reward": 1.431249976158142, + "reward_std": 0.47579821944236755, + "rewards/_accuracy_reward": 0.4312499761581421, + "rewards/_format_reward": 1.0, + "step": 1094 + }, + { + "completion_length": 167.375, + "epoch": 0.27375, + "grad_norm": 0.5020763874053955, + "kl": 0.050325650721788406, + "learning_rate": 4.55411734485541e-06, + "loss": 0.002, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1095 + }, + { + "completion_length": 133.125, + "epoch": 0.274, + "grad_norm": 0.718513011932373, + "kl": 0.05303411930799484, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0021, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1096 + }, + { + "completion_length": 189.375, + "epoch": 0.27425, + "grad_norm": 0.608174741268158, + "kl": 0.05475342273712158, + "learning_rate": 4.551627137461002e-06, + "loss": 0.0022, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1097 + }, + { + "completion_length": 158.25, + "epoch": 0.2745, + "grad_norm": 0.6787841320037842, + "kl": 0.05620495602488518, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0022, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1098 + }, + { + "completion_length": 146.375, + "epoch": 0.27475, + "grad_norm": 0.5872085690498352, + "kl": 0.041002292186021805, + "learning_rate": 4.549130680454532e-06, + "loss": 0.0016, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1099 + }, + { + "completion_length": 161.5, + "epoch": 0.275, + "grad_norm": 0.588703453540802, + "kl": 0.055859677493572235, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0022, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.668749988079071, + "rewards/_format_reward": 1.0, + "step": 1100 + }, + { + "completion_length": 170.5, + "epoch": 0.27525, + "grad_norm": 0.5734580159187317, + "kl": 0.06272434443235397, + "learning_rate": 4.546627981440639e-06, + "loss": 0.0025, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1101 + }, + { + "completion_length": 160.875, + "epoch": 0.2755, + "grad_norm": 0.6104035973548889, + "kl": 0.0559980645775795, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0022, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.875, + "step": 1102 + }, + { + "completion_length": 177.125, + "epoch": 0.27575, + "grad_norm": 0.5717198252677917, + "kl": 0.09279928356409073, + "learning_rate": 4.544119048042978e-06, + "loss": 0.0037, + "reward": 1.6875, + "reward_std": 0.4381372928619385, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 0.875, + "step": 1103 + }, + { + "completion_length": 196.625, + "epoch": 0.276, + "grad_norm": 0.5511927604675293, + "kl": 0.03467211127281189, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0014, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 1104 + }, + { + "completion_length": 155.625, + "epoch": 0.27625, + "grad_norm": 0.6550043821334839, + "kl": 0.04956316575407982, + "learning_rate": 4.541603887904198e-06, + "loss": 0.002, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1105 + }, + { + "completion_length": 163.875, + "epoch": 0.2765, + "grad_norm": 0.4754960834980011, + "kl": 0.046637773513793945, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1106 + }, + { + "completion_length": 172.25, + "epoch": 0.27675, + "grad_norm": 0.031275276094675064, + "kl": 0.04974092170596123, + "learning_rate": 4.5390825086859094e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1107 + }, + { + "completion_length": 149.75, + "epoch": 0.277, + "grad_norm": 0.5498752593994141, + "kl": 0.04549973085522652, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0018, + "reward": 1.4200000762939453, + "reward_std": 0.7127813100814819, + "rewards/_accuracy_reward": 0.5449999570846558, + "rewards/_format_reward": 0.875, + "step": 1108 + }, + { + "completion_length": 152.5, + "epoch": 0.27725, + "grad_norm": 0.6540334224700928, + "kl": 0.03749295696616173, + "learning_rate": 4.536554918068673e-06, + "loss": 0.0015, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1109 + }, + { + "completion_length": 155.125, + "epoch": 0.2775, + "grad_norm": 0.6249139904975891, + "kl": 0.052198849618434906, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1110 + }, + { + "completion_length": 162.75, + "epoch": 0.27775, + "grad_norm": 0.5957991480827332, + "kl": 0.0481363981962204, + "learning_rate": 4.5340211237519685e-06, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1111 + }, + { + "completion_length": 84.5, + "epoch": 0.278, + "grad_norm": 0.970755398273468, + "kl": 0.029281822964549065, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0012, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7575000524520874, + "rewards/_format_reward": 1.0, + "step": 1112 + }, + { + "completion_length": 160.5, + "epoch": 0.27825, + "grad_norm": 0.5828734040260315, + "kl": 0.05844755843281746, + "learning_rate": 4.5314811334541695e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1113 + }, + { + "completion_length": 180.625, + "epoch": 0.2785, + "grad_norm": 0.502030611038208, + "kl": 0.05643211305141449, + "learning_rate": 4.530208817229516e-06, + "loss": 0.0023, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1114 + }, + { + "completion_length": 120.375, + "epoch": 0.27875, + "grad_norm": 0.7388814091682434, + "kl": 0.038211237639188766, + "learning_rate": 4.528934954912531e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 1115 + }, + { + "completion_length": 203.375, + "epoch": 0.279, + "grad_norm": 0.4916674494743347, + "kl": 0.050826288759708405, + "learning_rate": 4.527659547473317e-06, + "loss": 0.002, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1116 + }, + { + "completion_length": 111.625, + "epoch": 0.27925, + "grad_norm": 0.6236878037452698, + "kl": 0.06385045498609543, + "learning_rate": 4.526382595883152e-06, + "loss": 0.0026, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1117 + }, + { + "completion_length": 169.125, + "epoch": 0.2795, + "grad_norm": 0.5840001702308655, + "kl": 0.05632218345999718, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0023, + "reward": 1.6262500286102295, + "reward_std": 0.7428312301635742, + "rewards/_accuracy_reward": 0.7512500286102295, + "rewards/_format_reward": 0.875, + "step": 1118 + }, + { + "completion_length": 130.0, + "epoch": 0.27975, + "grad_norm": 0.8513467311859131, + "kl": 0.08324826508760452, + "learning_rate": 4.523824064140961e-06, + "loss": 0.0033, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1119 + }, + { + "completion_length": 149.75, + "epoch": 0.28, + "grad_norm": 0.480070024728775, + "kl": 0.039729684591293335, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0016, + "reward": 1.5012500286102295, + "reward_std": 0.923509418964386, + "rewards/_accuracy_reward": 0.7512500286102295, + "rewards/_format_reward": 0.75, + "step": 1120 + }, + { + "completion_length": 187.0, + "epoch": 0.28025, + "grad_norm": 0.5705462694168091, + "kl": 0.05749392881989479, + "learning_rate": 4.521259367479691e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1121 + }, + { + "completion_length": 92.5, + "epoch": 0.2805, + "grad_norm": 0.5976485013961792, + "kl": 0.06839491426944733, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0027, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1122 + }, + { + "completion_length": 145.75, + "epoch": 0.28075, + "grad_norm": 0.46838706731796265, + "kl": 0.0469730868935585, + "learning_rate": 4.51868851371185e-06, + "loss": 0.0019, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 1123 + }, + { + "completion_length": 138.625, + "epoch": 0.281, + "grad_norm": 0.02328825183212757, + "kl": 0.045329801738262177, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1124 + }, + { + "completion_length": 123.0, + "epoch": 0.28125, + "grad_norm": 0.023147309198975563, + "kl": 0.05815961956977844, + "learning_rate": 4.516111510668707e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1125 + }, + { + "completion_length": 176.125, + "epoch": 0.2815, + "grad_norm": 0.5607932806015015, + "kl": 0.052996277809143066, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0021, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1126 + }, + { + "completion_length": 89.625, + "epoch": 0.28175, + "grad_norm": 0.023232001811265945, + "kl": 0.04320669546723366, + "learning_rate": 4.513528366200258e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1127 + }, + { + "completion_length": 160.875, + "epoch": 0.282, + "grad_norm": 0.5776987671852112, + "kl": 0.05144071206450462, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1128 + }, + { + "completion_length": 172.75, + "epoch": 0.28225, + "grad_norm": 0.5175846219062805, + "kl": 0.05581043288111687, + "learning_rate": 4.510939088175211e-06, + "loss": 0.0022, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 1129 + }, + { + "completion_length": 145.375, + "epoch": 0.2825, + "grad_norm": 0.6114826202392578, + "kl": 0.05112025886774063, + "learning_rate": 4.509642151543043e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1130 + }, + { + "completion_length": 102.75, + "epoch": 0.28275, + "grad_norm": 0.6514590978622437, + "kl": 0.04705269634723663, + "learning_rate": 4.508343684480956e-06, + "loss": 0.0019, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1131 + }, + { + "completion_length": 158.75, + "epoch": 0.283, + "grad_norm": 0.608859658241272, + "kl": 0.1234949603676796, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0049, + "reward": 1.443750023841858, + "reward_std": 0.7022603750228882, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 0.75, + "step": 1132 + }, + { + "completion_length": 149.875, + "epoch": 0.28325, + "grad_norm": 0.8146029710769653, + "kl": 0.056399088352918625, + "learning_rate": 4.505742163023541e-06, + "loss": 0.0023, + "reward": 1.0437499284744263, + "reward_std": 0.5212878584861755, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 0.875, + "step": 1133 + }, + { + "completion_length": 207.125, + "epoch": 0.2835, + "grad_norm": 0.5459649562835693, + "kl": 0.06084807217121124, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0024, + "reward": 1.28125, + "reward_std": 0.8807210922241211, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.625, + "step": 1134 + }, + { + "completion_length": 151.375, + "epoch": 0.28375, + "grad_norm": 0.7785174250602722, + "kl": 0.05954066291451454, + "learning_rate": 4.503134531727652e-06, + "loss": 0.0024, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1135 + }, + { + "completion_length": 152.0, + "epoch": 0.284, + "grad_norm": 0.05124456435441971, + "kl": 0.04783984273672104, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1136 + }, + { + "completion_length": 168.875, + "epoch": 0.28425, + "grad_norm": 0.027116047218441963, + "kl": 0.0674908459186554, + "learning_rate": 4.5005207985365875e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1137 + }, + { + "completion_length": 156.75, + "epoch": 0.2845, + "grad_norm": 0.7124760150909424, + "kl": 0.06690853834152222, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0027, + "reward": 1.756250023841858, + "reward_std": 0.45153507590293884, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 1138 + }, + { + "completion_length": 181.75, + "epoch": 0.28475, + "grad_norm": 0.5964755415916443, + "kl": 0.06595727056264877, + "learning_rate": 4.49790097141223e-06, + "loss": 0.0026, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1139 + }, + { + "completion_length": 156.25, + "epoch": 0.285, + "grad_norm": 0.8890787959098816, + "kl": 0.07360620051622391, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0029, + "reward": 1.631250023841858, + "reward_std": 0.7382108569145203, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 1140 + }, + { + "completion_length": 157.5, + "epoch": 0.28525, + "grad_norm": 0.8517801761627197, + "kl": 0.0814465880393982, + "learning_rate": 4.495275058335029e-06, + "loss": 0.0033, + "reward": 1.4187500476837158, + "reward_std": 0.7235515117645264, + "rewards/_accuracy_reward": 0.543749988079071, + "rewards/_format_reward": 0.875, + "step": 1141 + }, + { + "completion_length": 171.125, + "epoch": 0.2855, + "grad_norm": 0.5479044914245605, + "kl": 0.04513910040259361, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0018, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.875, + "step": 1142 + }, + { + "completion_length": 164.0, + "epoch": 0.28575, + "grad_norm": 0.05868508666753769, + "kl": 0.06565727293491364, + "learning_rate": 4.49264306730397e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1143 + }, + { + "completion_length": 155.75, + "epoch": 0.286, + "grad_norm": 0.7048906087875366, + "kl": 0.04985063150525093, + "learning_rate": 4.491324795060491e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1144 + }, + { + "completion_length": 129.625, + "epoch": 0.28625, + "grad_norm": 0.017044005915522575, + "kl": 0.04838981479406357, + "learning_rate": 4.490005006336555e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1145 + }, + { + "completion_length": 131.375, + "epoch": 0.2865, + "grad_norm": 0.5756621360778809, + "kl": 0.05202525854110718, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0021, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1146 + }, + { + "completion_length": 152.375, + "epoch": 0.28675, + "grad_norm": 1.1808115243911743, + "kl": 0.06879065185785294, + "learning_rate": 4.487360883468775e-06, + "loss": 0.0028, + "reward": 1.3125, + "reward_std": 0.4299086630344391, + "rewards/_accuracy_reward": 0.3124999701976776, + "rewards/_format_reward": 1.0, + "step": 1147 + }, + { + "completion_length": 175.125, + "epoch": 0.287, + "grad_norm": 0.7814071774482727, + "kl": 0.06425180286169052, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0026, + "reward": 1.46875, + "reward_std": 0.6999680995941162, + "rewards/_accuracy_reward": 0.59375, + "rewards/_format_reward": 0.875, + "step": 1148 + }, + { + "completion_length": 115.75, + "epoch": 0.28725, + "grad_norm": 0.8087677955627441, + "kl": 0.04324078559875488, + "learning_rate": 4.484710706755087e-06, + "loss": 0.0017, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1149 + }, + { + "completion_length": 144.375, + "epoch": 0.2875, + "grad_norm": 0.9671500325202942, + "kl": 0.09105661511421204, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0036, + "reward": 1.5149999856948853, + "reward_std": 0.5187072157859802, + "rewards/_accuracy_reward": 0.5149999856948853, + "rewards/_format_reward": 1.0, + "step": 1150 + }, + { + "completion_length": 156.375, + "epoch": 0.28775, + "grad_norm": 0.7499131560325623, + "kl": 0.07455121725797653, + "learning_rate": 4.482054484268389e-06, + "loss": 0.003, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 1151 + }, + { + "completion_length": 144.75, + "epoch": 0.288, + "grad_norm": 0.08447606861591339, + "kl": 0.10362078249454498, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0041, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1152 + }, + { + "completion_length": 96.125, + "epoch": 0.28825, + "grad_norm": 0.8950498700141907, + "kl": 0.10107363015413284, + "learning_rate": 4.4793922240999935e-06, + "loss": 0.004, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1153 + }, + { + "completion_length": 122.125, + "epoch": 0.2885, + "grad_norm": 1.3117358684539795, + "kl": 0.09728724509477615, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0039, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 1154 + }, + { + "completion_length": 167.0, + "epoch": 0.28875, + "grad_norm": 0.6316139101982117, + "kl": 0.08301078528165817, + "learning_rate": 4.476723934359609e-06, + "loss": 0.0033, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.643750011920929, + "rewards/_format_reward": 1.0, + "step": 1155 + }, + { + "completion_length": 130.875, + "epoch": 0.289, + "grad_norm": 0.7130420804023743, + "kl": 0.08175483345985413, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0033, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1156 + }, + { + "completion_length": 135.5, + "epoch": 0.28925, + "grad_norm": 0.06840886920690536, + "kl": 0.07843038439750671, + "learning_rate": 4.474049623175307e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1157 + }, + { + "completion_length": 105.875, + "epoch": 0.2895, + "grad_norm": 0.633524477481842, + "kl": 0.07753744721412659, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0031, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1158 + }, + { + "completion_length": 168.5, + "epoch": 0.28975, + "grad_norm": 0.808603048324585, + "kl": 0.08730296045541763, + "learning_rate": 4.471369298693505e-06, + "loss": 0.0035, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1159 + }, + { + "completion_length": 131.75, + "epoch": 0.29, + "grad_norm": 0.039577484130859375, + "kl": 0.06506015360355377, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1160 + }, + { + "completion_length": 107.25, + "epoch": 0.29025, + "grad_norm": 0.7665241360664368, + "kl": 0.09168189764022827, + "learning_rate": 4.468682969078935e-06, + "loss": 0.0037, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1161 + }, + { + "completion_length": 148.875, + "epoch": 0.2905, + "grad_norm": 0.6652460694313049, + "kl": 0.05769990384578705, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0023, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1162 + }, + { + "completion_length": 180.5, + "epoch": 0.29075, + "grad_norm": 0.6770204305648804, + "kl": 0.0662575513124466, + "learning_rate": 4.465990642514622e-06, + "loss": 0.0027, + "reward": 1.537500023841858, + "reward_std": 0.7322909235954285, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.875, + "step": 1163 + }, + { + "completion_length": 110.0, + "epoch": 0.291, + "grad_norm": 0.037855084985494614, + "kl": 0.06914416700601578, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1164 + }, + { + "completion_length": 155.125, + "epoch": 0.29125, + "grad_norm": 0.48786526918411255, + "kl": 0.05076320096850395, + "learning_rate": 4.463292327201862e-06, + "loss": 0.002, + "reward": 1.1837499141693115, + "reward_std": 0.3386079967021942, + "rewards/_accuracy_reward": 0.1837499886751175, + "rewards/_format_reward": 1.0, + "step": 1165 + }, + { + "completion_length": 75.375, + "epoch": 0.2915, + "grad_norm": 0.994990348815918, + "kl": 0.09153227508068085, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0037, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1166 + }, + { + "completion_length": 109.625, + "epoch": 0.29175, + "grad_norm": 0.021669652312994003, + "kl": 0.04759254679083824, + "learning_rate": 4.460588031360191e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1167 + }, + { + "completion_length": 93.875, + "epoch": 0.292, + "grad_norm": 0.8044827580451965, + "kl": 0.03482041880488396, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0014, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1168 + }, + { + "completion_length": 166.125, + "epoch": 0.29225, + "grad_norm": 0.6489118933677673, + "kl": 0.06921645253896713, + "learning_rate": 4.457877763227361e-06, + "loss": 0.0028, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 1.0, + "step": 1169 + }, + { + "completion_length": 168.75, + "epoch": 0.2925, + "grad_norm": 0.5671146512031555, + "kl": 0.06925438344478607, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0028, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.518750011920929, + "rewards/_format_reward": 0.875, + "step": 1170 + }, + { + "completion_length": 176.5, + "epoch": 0.29275, + "grad_norm": 0.7186687588691711, + "kl": 0.08284782618284225, + "learning_rate": 4.45516153105932e-06, + "loss": 0.0033, + "reward": 1.28125, + "reward_std": 0.44395747780799866, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 0.875, + "step": 1171 + }, + { + "completion_length": 140.0, + "epoch": 0.293, + "grad_norm": 0.5549473762512207, + "kl": 0.05967408046126366, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0024, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1172 + }, + { + "completion_length": 160.625, + "epoch": 0.29325, + "grad_norm": 0.593147337436676, + "kl": 0.062421780079603195, + "learning_rate": 4.452439343130183e-06, + "loss": 0.0025, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1173 + }, + { + "completion_length": 135.125, + "epoch": 0.2935, + "grad_norm": 0.03139398992061615, + "kl": 0.05725998803973198, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1174 + }, + { + "completion_length": 136.875, + "epoch": 0.29375, + "grad_norm": 0.035139165818691254, + "kl": 0.08669183403253555, + "learning_rate": 4.4497112077322045e-06, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1175 + }, + { + "completion_length": 93.125, + "epoch": 0.294, + "grad_norm": 0.03356340900063515, + "kl": 0.10462416708469391, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0042, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1176 + }, + { + "completion_length": 174.5, + "epoch": 0.29425, + "grad_norm": 0.6406842470169067, + "kl": 0.10782821476459503, + "learning_rate": 4.446977133175761e-06, + "loss": 0.0043, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 1177 + }, + { + "completion_length": 180.125, + "epoch": 0.2945, + "grad_norm": 0.6326098442077637, + "kl": 0.0835682675242424, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0033, + "reward": 1.1437499523162842, + "reward_std": 0.8317097425460815, + "rewards/_accuracy_reward": 0.39374998211860657, + "rewards/_format_reward": 0.75, + "step": 1178 + }, + { + "completion_length": 96.625, + "epoch": 0.29475, + "grad_norm": 0.7805556058883667, + "kl": 0.03996328264474869, + "learning_rate": 4.444237127789315e-06, + "loss": 0.0016, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 1.0, + "step": 1179 + }, + { + "completion_length": 133.375, + "epoch": 0.295, + "grad_norm": 0.02025960385799408, + "kl": 0.059164561331272125, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1180 + }, + { + "completion_length": 147.875, + "epoch": 0.29525, + "grad_norm": 0.03799540922045708, + "kl": 0.06712915748357773, + "learning_rate": 4.4414911999194e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1181 + }, + { + "completion_length": 164.25, + "epoch": 0.2955, + "grad_norm": 0.6490309238433838, + "kl": 0.07048535346984863, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0028, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1182 + }, + { + "completion_length": 136.0, + "epoch": 0.29575, + "grad_norm": 0.6978442072868347, + "kl": 0.06952231377363205, + "learning_rate": 4.438739357930587e-06, + "loss": 0.0028, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1183 + }, + { + "completion_length": 98.125, + "epoch": 0.296, + "grad_norm": 1.5878691673278809, + "kl": 0.04807111620903015, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0019, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1184 + }, + { + "completion_length": 166.0, + "epoch": 0.29625, + "grad_norm": 0.5427411198616028, + "kl": 0.07635194063186646, + "learning_rate": 4.435981610205464e-06, + "loss": 0.0031, + "reward": 0.9699999094009399, + "reward_std": 0.3982820212841034, + "rewards/_accuracy_reward": 0.0949999988079071, + "rewards/_format_reward": 0.875, + "step": 1185 + }, + { + "completion_length": 140.625, + "epoch": 0.2965, + "grad_norm": 0.6156898140907288, + "kl": 0.06197688356041908, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0025, + "reward": 1.5199999809265137, + "reward_std": 0.5133086442947388, + "rewards/_accuracy_reward": 0.5199999809265137, + "rewards/_format_reward": 1.0, + "step": 1186 + }, + { + "completion_length": 144.0, + "epoch": 0.29675, + "grad_norm": 0.6569059491157532, + "kl": 0.05045042932033539, + "learning_rate": 4.4332179651446106e-06, + "loss": 0.002, + "reward": 1.6325000524520874, + "reward_std": 0.507395327091217, + "rewards/_accuracy_reward": 0.7574999928474426, + "rewards/_format_reward": 0.875, + "step": 1187 + }, + { + "completion_length": 164.5, + "epoch": 0.297, + "grad_norm": 0.7192619442939758, + "kl": 0.07004435360431671, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0028, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1188 + }, + { + "completion_length": 160.125, + "epoch": 0.29725, + "grad_norm": 0.7192215323448181, + "kl": 0.05854785814881325, + "learning_rate": 4.430448431166567e-06, + "loss": 0.0023, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.40625, + "rewards/_format_reward": 1.0, + "step": 1189 + }, + { + "completion_length": 148.375, + "epoch": 0.2975, + "grad_norm": 0.6885538101196289, + "kl": 0.07502228021621704, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.003, + "reward": 1.5499999523162842, + "reward_std": 0.4855041801929474, + "rewards/_accuracy_reward": 0.5499999523162842, + "rewards/_format_reward": 1.0, + "step": 1190 + }, + { + "completion_length": 93.125, + "epoch": 0.29775, + "grad_norm": 0.816205620765686, + "kl": 0.08329864591360092, + "learning_rate": 4.427673016707817e-06, + "loss": 0.0033, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1191 + }, + { + "completion_length": 160.25, + "epoch": 0.298, + "grad_norm": 0.7627013921737671, + "kl": 0.0577714703977108, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0023, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1192 + }, + { + "completion_length": 135.75, + "epoch": 0.29825, + "grad_norm": 0.6774864196777344, + "kl": 0.05635792762041092, + "learning_rate": 4.424891730222749e-06, + "loss": 0.0023, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1193 + }, + { + "completion_length": 113.625, + "epoch": 0.2985, + "grad_norm": 0.7691601514816284, + "kl": 0.0638766884803772, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0026, + "reward": 1.4375, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.4375, + "rewards/_format_reward": 1.0, + "step": 1194 + }, + { + "completion_length": 115.0, + "epoch": 0.29875, + "grad_norm": 0.9962632060050964, + "kl": 0.09802590310573578, + "learning_rate": 4.422104580183649e-06, + "loss": 0.0039, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7575000524520874, + "rewards/_format_reward": 1.0, + "step": 1195 + }, + { + "completion_length": 132.125, + "epoch": 0.299, + "grad_norm": 0.8800735473632812, + "kl": 0.05530092492699623, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0022, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1196 + }, + { + "completion_length": 129.0, + "epoch": 0.29925, + "grad_norm": 0.02906613051891327, + "kl": 0.08473115414381027, + "learning_rate": 4.419311575080657e-06, + "loss": 0.0034, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1197 + }, + { + "completion_length": 194.375, + "epoch": 0.2995, + "grad_norm": 0.5856153964996338, + "kl": 0.06350870430469513, + "learning_rate": 4.41791287953825e-06, + "loss": 0.0025, + "reward": 1.431249976158142, + "reward_std": 0.47579821944236755, + "rewards/_accuracy_reward": 0.4312500059604645, + "rewards/_format_reward": 1.0, + "step": 1198 + }, + { + "completion_length": 159.0, + "epoch": 0.29975, + "grad_norm": 0.03134987875819206, + "kl": 0.06507380306720734, + "learning_rate": 4.416512723421752e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1199 + }, + { + "completion_length": 124.125, + "epoch": 0.3, + "grad_norm": 0.0544891431927681, + "kl": 0.07167188823223114, + "learning_rate": 4.415111107797445e-06, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1200 + }, + { + "completion_length": 140.25, + "epoch": 0.30025, + "grad_norm": 0.032767925411462784, + "kl": 0.07854177057743073, + "learning_rate": 4.413708033732721e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1201 + }, + { + "completion_length": 97.0, + "epoch": 0.3005, + "grad_norm": 0.02442978322505951, + "kl": 0.03377045691013336, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0014, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1202 + }, + { + "completion_length": 156.375, + "epoch": 0.30075, + "grad_norm": 0.8720740079879761, + "kl": 0.06999517232179642, + "learning_rate": 4.410897514557134e-06, + "loss": 0.0028, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1203 + }, + { + "completion_length": 161.125, + "epoch": 0.301, + "grad_norm": 0.6562350988388062, + "kl": 0.05543859675526619, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0022, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 1204 + }, + { + "completion_length": 158.75, + "epoch": 0.30125, + "grad_norm": 0.7255245447158813, + "kl": 0.06354228407144547, + "learning_rate": 4.408081174456322e-06, + "loss": 0.0025, + "reward": 1.3512500524520874, + "reward_std": 0.6214140057563782, + "rewards/_accuracy_reward": 0.4762499928474426, + "rewards/_format_reward": 0.875, + "step": 1205 + }, + { + "completion_length": 152.625, + "epoch": 0.3015, + "grad_norm": 0.9874303936958313, + "kl": 0.0944613516330719, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0038, + "reward": 1.600000023841858, + "reward_std": 0.43260011076927185, + "rewards/_accuracy_reward": 0.6000000238418579, + "rewards/_format_reward": 1.0, + "step": 1206 + }, + { + "completion_length": 159.625, + "epoch": 0.30175, + "grad_norm": 0.12287536263465881, + "kl": 0.07532623410224915, + "learning_rate": 4.405259022009345e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1207 + }, + { + "completion_length": 164.625, + "epoch": 0.302, + "grad_norm": 0.6047512888908386, + "kl": 0.042720258235931396, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0017, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1208 + }, + { + "completion_length": 147.375, + "epoch": 0.30225, + "grad_norm": 0.718567967414856, + "kl": 0.06836355477571487, + "learning_rate": 4.402431065812968e-06, + "loss": 0.0027, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1209 + }, + { + "completion_length": 133.125, + "epoch": 0.3025, + "grad_norm": 0.051449116319417953, + "kl": 0.09786742180585861, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0039, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1210 + }, + { + "completion_length": 155.0, + "epoch": 0.30275, + "grad_norm": 0.6808714270591736, + "kl": 0.06858290731906891, + "learning_rate": 4.399597314481635e-06, + "loss": 0.0027, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.875, + "step": 1211 + }, + { + "completion_length": 103.625, + "epoch": 0.303, + "grad_norm": 0.7550353407859802, + "kl": 0.07095042616128922, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0028, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1212 + }, + { + "completion_length": 157.75, + "epoch": 0.30325, + "grad_norm": 0.03089936450123787, + "kl": 0.07108917087316513, + "learning_rate": 4.396757776647446e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1213 + }, + { + "completion_length": 125.0, + "epoch": 0.3035, + "grad_norm": 0.6539469957351685, + "kl": 0.05641159415245056, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0023, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1214 + }, + { + "completion_length": 131.5, + "epoch": 0.30375, + "grad_norm": 0.09180185198783875, + "kl": 0.0818408653140068, + "learning_rate": 4.393912460960125e-06, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1215 + }, + { + "completion_length": 117.0, + "epoch": 0.304, + "grad_norm": 1.0178251266479492, + "kl": 0.14222905039787292, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0057, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1216 + }, + { + "completion_length": 176.5, + "epoch": 0.30425, + "grad_norm": 0.5800394415855408, + "kl": 0.07087651640176773, + "learning_rate": 4.391061376086996e-06, + "loss": 0.0028, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1217 + }, + { + "completion_length": 153.875, + "epoch": 0.3045, + "grad_norm": 0.022167326882481575, + "kl": 0.04559296742081642, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1218 + }, + { + "completion_length": 143.5, + "epoch": 0.30475, + "grad_norm": 0.6811370253562927, + "kl": 0.05372779816389084, + "learning_rate": 4.388204530712959e-06, + "loss": 0.0021, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1219 + }, + { + "completion_length": 159.5, + "epoch": 0.305, + "grad_norm": 0.6724026203155518, + "kl": 0.06683686375617981, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0027, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1220 + }, + { + "completion_length": 190.5, + "epoch": 0.30525, + "grad_norm": 0.4614053964614868, + "kl": 0.0702865943312645, + "learning_rate": 4.385341933540461e-06, + "loss": 0.0028, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1221 + }, + { + "completion_length": 199.875, + "epoch": 0.3055, + "grad_norm": 0.4716734290122986, + "kl": 0.06874912232160568, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0027, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1222 + }, + { + "completion_length": 116.5, + "epoch": 0.30575, + "grad_norm": 0.662601113319397, + "kl": 0.047971662133932114, + "learning_rate": 4.3824735932894695e-06, + "loss": 0.0019, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1223 + }, + { + "completion_length": 95.625, + "epoch": 0.306, + "grad_norm": 0.805237352848053, + "kl": 0.037139102816581726, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0015, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1224 + }, + { + "completion_length": 182.875, + "epoch": 0.30625, + "grad_norm": 0.480247437953949, + "kl": 0.06881922483444214, + "learning_rate": 4.379599518697444e-06, + "loss": 0.0028, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1225 + }, + { + "completion_length": 81.0, + "epoch": 0.3065, + "grad_norm": 0.9215694665908813, + "kl": 0.08191632479429245, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0033, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 1226 + }, + { + "completion_length": 178.875, + "epoch": 0.30675, + "grad_norm": 0.04747424274682999, + "kl": 0.08109784126281738, + "learning_rate": 4.3767197185193164e-06, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1227 + }, + { + "completion_length": 166.25, + "epoch": 0.307, + "grad_norm": 0.45579221844673157, + "kl": 0.04330654814839363, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0017, + "reward": 1.631250023841858, + "reward_std": 0.7382108569145203, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 1228 + }, + { + "completion_length": 167.625, + "epoch": 0.30725, + "grad_norm": 0.5388981699943542, + "kl": 0.044501304626464844, + "learning_rate": 4.373834201527457e-06, + "loss": 0.0018, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.875, + "step": 1229 + }, + { + "completion_length": 142.375, + "epoch": 0.3075, + "grad_norm": 0.04394035413861275, + "kl": 0.06808657199144363, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1230 + }, + { + "completion_length": 145.625, + "epoch": 0.30775, + "grad_norm": 0.036082785576581955, + "kl": 0.06343095749616623, + "learning_rate": 4.370942976511651e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1231 + }, + { + "completion_length": 109.875, + "epoch": 0.308, + "grad_norm": 0.6433164477348328, + "kl": 0.05788550525903702, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0023, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1232 + }, + { + "completion_length": 137.875, + "epoch": 0.30825, + "grad_norm": 0.6208747625350952, + "kl": 0.04600555822253227, + "learning_rate": 4.36804605227907e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1233 + }, + { + "completion_length": 134.125, + "epoch": 0.3085, + "grad_norm": 0.0351361520588398, + "kl": 0.07358551025390625, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1234 + }, + { + "completion_length": 83.75, + "epoch": 0.30875, + "grad_norm": 1.0054473876953125, + "kl": 0.03810626268386841, + "learning_rate": 4.365143437654249e-06, + "loss": 0.0015, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1235 + }, + { + "completion_length": 203.75, + "epoch": 0.309, + "grad_norm": 0.4629731774330139, + "kl": 0.044374044984579086, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0018, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1236 + }, + { + "completion_length": 159.375, + "epoch": 0.30925, + "grad_norm": 0.017106125131249428, + "kl": 0.03975909203290939, + "learning_rate": 4.362235141479055e-06, + "loss": 0.0016, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1237 + }, + { + "completion_length": 151.75, + "epoch": 0.3095, + "grad_norm": 0.4982658326625824, + "kl": 0.056246038526296616, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0022, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1238 + }, + { + "completion_length": 155.75, + "epoch": 0.30975, + "grad_norm": 0.7274753451347351, + "kl": 0.05836237221956253, + "learning_rate": 4.359321172612664e-06, + "loss": 0.0023, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 1.0, + "step": 1239 + }, + { + "completion_length": 119.375, + "epoch": 0.31, + "grad_norm": 0.7453700304031372, + "kl": 0.05975281819701195, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0024, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1240 + }, + { + "completion_length": 136.75, + "epoch": 0.31025, + "grad_norm": 0.6040889620780945, + "kl": 0.06652972102165222, + "learning_rate": 4.356401539931528e-06, + "loss": 0.0027, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1241 + }, + { + "completion_length": 173.25, + "epoch": 0.3105, + "grad_norm": 0.44152548909187317, + "kl": 0.04805905371904373, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1242 + }, + { + "completion_length": 158.5, + "epoch": 0.31075, + "grad_norm": 0.021656127646565437, + "kl": 0.05573081597685814, + "learning_rate": 4.353476252329356e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1243 + }, + { + "completion_length": 117.5, + "epoch": 0.311, + "grad_norm": 0.023949656635522842, + "kl": 0.057754624634981155, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1244 + }, + { + "completion_length": 166.375, + "epoch": 0.31125, + "grad_norm": 0.7155207395553589, + "kl": 0.045125432312488556, + "learning_rate": 4.350545318717081e-06, + "loss": 0.0018, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.875, + "step": 1245 + }, + { + "completion_length": 131.0, + "epoch": 0.3115, + "grad_norm": 0.5737600922584534, + "kl": 0.035169921815395355, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0014, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1246 + }, + { + "completion_length": 192.75, + "epoch": 0.31175, + "grad_norm": 0.5308739542961121, + "kl": 0.04862486571073532, + "learning_rate": 4.347608748022835e-06, + "loss": 0.0019, + "reward": 1.2774999141693115, + "reward_std": 0.44627827405929565, + "rewards/_accuracy_reward": 0.2774999737739563, + "rewards/_format_reward": 1.0, + "step": 1247 + }, + { + "completion_length": 128.5, + "epoch": 0.312, + "grad_norm": 0.8783921003341675, + "kl": 0.0623704232275486, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0025, + "reward": 1.401249885559082, + "reward_std": 0.49599650502204895, + "rewards/_accuracy_reward": 0.4012499749660492, + "rewards/_format_reward": 1.0, + "step": 1248 + }, + { + "completion_length": 170.5, + "epoch": 0.31225, + "grad_norm": 0.6559812426567078, + "kl": 0.05016009509563446, + "learning_rate": 4.344666549191921e-06, + "loss": 0.002, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1249 + }, + { + "completion_length": 194.25, + "epoch": 0.3125, + "grad_norm": 0.4926063120365143, + "kl": 0.037470243871212006, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0015, + "reward": 1.256250023841858, + "reward_std": 0.8845650553703308, + "rewards/_accuracy_reward": 0.6312500238418579, + "rewards/_format_reward": 0.625, + "step": 1250 + }, + { + "completion_length": 163.25, + "epoch": 0.31275, + "grad_norm": 0.6643059253692627, + "kl": 0.0781041607260704, + "learning_rate": 4.341718731186788e-06, + "loss": 0.0031, + "reward": 1.5187499523162842, + "reward_std": 0.7323824167251587, + "rewards/_accuracy_reward": 0.643750011920929, + "rewards/_format_reward": 0.875, + "step": 1251 + }, + { + "completion_length": 168.625, + "epoch": 0.313, + "grad_norm": 0.5912598967552185, + "kl": 0.0778733566403389, + "learning_rate": 4.340242717799337e-06, + "loss": 0.0031, + "reward": 1.3875000476837158, + "reward_std": 0.7371518611907959, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.75, + "step": 1252 + }, + { + "completion_length": 148.0, + "epoch": 0.31325, + "grad_norm": 0.022581512108445168, + "kl": 0.06389284133911133, + "learning_rate": 4.338765302987001e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1253 + }, + { + "completion_length": 123.625, + "epoch": 0.3135, + "grad_norm": 0.023221751675009727, + "kl": 0.05139881372451782, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1254 + }, + { + "completion_length": 175.0, + "epoch": 0.31375, + "grad_norm": 0.5853912830352783, + "kl": 0.05094706267118454, + "learning_rate": 4.335806273589214e-06, + "loss": 0.002, + "reward": 1.5012500286102295, + "reward_std": 0.9235093593597412, + "rewards/_accuracy_reward": 0.7512500286102295, + "rewards/_format_reward": 0.75, + "step": 1255 + }, + { + "completion_length": 187.0, + "epoch": 0.314, + "grad_norm": 0.4903429448604584, + "kl": 0.05913626775145531, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0024, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1256 + }, + { + "completion_length": 158.75, + "epoch": 0.31425, + "grad_norm": 0.5297297239303589, + "kl": 0.03816875442862511, + "learning_rate": 4.332841652007144e-06, + "loss": 0.0015, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1257 + }, + { + "completion_length": 121.125, + "epoch": 0.3145, + "grad_norm": 0.8008362650871277, + "kl": 0.055129993706941605, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0022, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 1258 + }, + { + "completion_length": 179.0, + "epoch": 0.31475, + "grad_norm": 0.020386753603816032, + "kl": 0.04700789228081703, + "learning_rate": 4.329871447271541e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1259 + }, + { + "completion_length": 122.875, + "epoch": 0.315, + "grad_norm": 0.7078530192375183, + "kl": 0.03913933411240578, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0016, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.668749988079071, + "rewards/_format_reward": 1.0, + "step": 1260 + }, + { + "completion_length": 102.625, + "epoch": 0.31525, + "grad_norm": 0.8336498141288757, + "kl": 0.09925613552331924, + "learning_rate": 4.326895668430166e-06, + "loss": 0.004, + "reward": 1.5012500286102295, + "reward_std": 0.42089828848838806, + "rewards/_accuracy_reward": 0.5012500286102295, + "rewards/_format_reward": 1.0, + "step": 1261 + }, + { + "completion_length": 163.5, + "epoch": 0.3155, + "grad_norm": 0.6298437118530273, + "kl": 0.06484002619981766, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0026, + "reward": 1.631250023841858, + "reward_std": 0.7382108569145203, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 1262 + }, + { + "completion_length": 180.0, + "epoch": 0.31575, + "grad_norm": 0.6922435164451599, + "kl": 0.06569919735193253, + "learning_rate": 4.323914324547755e-06, + "loss": 0.0026, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1263 + }, + { + "completion_length": 120.625, + "epoch": 0.316, + "grad_norm": 0.8579962253570557, + "kl": 0.06102241203188896, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0024, + "reward": 1.6687500476837158, + "reward_std": 0.4613160789012909, + "rewards/_accuracy_reward": 0.6687500476837158, + "rewards/_format_reward": 1.0, + "step": 1264 + }, + { + "completion_length": 148.25, + "epoch": 0.31625, + "grad_norm": 0.5821033120155334, + "kl": 0.04418959096074104, + "learning_rate": 4.320927424706001e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1265 + }, + { + "completion_length": 171.25, + "epoch": 0.3165, + "grad_norm": 0.6433318853378296, + "kl": 0.05075891688466072, + "learning_rate": 4.319431894143027e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1266 + }, + { + "completion_length": 142.125, + "epoch": 0.31675, + "grad_norm": 0.5217633247375488, + "kl": 0.03551534563302994, + "learning_rate": 4.317934978003517e-06, + "loss": 0.0014, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1267 + }, + { + "completion_length": 146.25, + "epoch": 0.317, + "grad_norm": 0.6079625487327576, + "kl": 0.03397079184651375, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0014, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1268 + }, + { + "completion_length": 161.625, + "epoch": 0.31725, + "grad_norm": 0.6995141506195068, + "kl": 0.031033797189593315, + "learning_rate": 4.314936993555816e-06, + "loss": 0.0012, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1269 + }, + { + "completion_length": 173.25, + "epoch": 0.3175, + "grad_norm": 0.593582808971405, + "kl": 0.048646219074726105, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1270 + }, + { + "completion_length": 154.125, + "epoch": 0.31775, + "grad_norm": 0.7062419056892395, + "kl": 0.04136907681822777, + "learning_rate": 4.311933480495278e-06, + "loss": 0.0017, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1271 + }, + { + "completion_length": 159.75, + "epoch": 0.318, + "grad_norm": 0.4106045663356781, + "kl": 0.057584941387176514, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1272 + }, + { + "completion_length": 132.5, + "epoch": 0.31825, + "grad_norm": 0.06259723007678986, + "kl": 0.07464718818664551, + "learning_rate": 4.308924447971123e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1273 + }, + { + "completion_length": 143.125, + "epoch": 0.3185, + "grad_norm": 0.025278618559241295, + "kl": 0.04770468547940254, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1274 + }, + { + "completion_length": 153.375, + "epoch": 0.31875, + "grad_norm": 0.6714683771133423, + "kl": 0.0832626223564148, + "learning_rate": 4.305909905149389e-06, + "loss": 0.0033, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1275 + }, + { + "completion_length": 152.0, + "epoch": 0.319, + "grad_norm": 0.03329375758767128, + "kl": 0.06376402080059052, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1276 + }, + { + "completion_length": 150.125, + "epoch": 0.31925, + "grad_norm": 0.0274689681828022, + "kl": 0.0662902370095253, + "learning_rate": 4.302889861212894e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1277 + }, + { + "completion_length": 151.0, + "epoch": 0.3195, + "grad_norm": 0.5757763981819153, + "kl": 0.0469672717154026, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0019, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1278 + }, + { + "completion_length": 106.25, + "epoch": 0.31975, + "grad_norm": 0.015396623872220516, + "kl": 0.08767110854387283, + "learning_rate": 4.299864325361217e-06, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1279 + }, + { + "completion_length": 147.75, + "epoch": 0.32, + "grad_norm": 1.0080978870391846, + "kl": 0.050190072506666183, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.002, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1280 + }, + { + "completion_length": 139.625, + "epoch": 0.32025, + "grad_norm": 0.04891718551516533, + "kl": 0.08530929684638977, + "learning_rate": 4.2968333068106635e-06, + "loss": 0.0034, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1281 + }, + { + "completion_length": 175.25, + "epoch": 0.3205, + "grad_norm": 0.6860036253929138, + "kl": 0.09078608453273773, + "learning_rate": 4.295315744407972e-06, + "loss": 0.0036, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1282 + }, + { + "completion_length": 172.75, + "epoch": 0.32075, + "grad_norm": 0.03762355074286461, + "kl": 0.05991184711456299, + "learning_rate": 4.293796814794243e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1283 + }, + { + "completion_length": 176.375, + "epoch": 0.321, + "grad_norm": 0.5721231698989868, + "kl": 0.05818440765142441, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0023, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1284 + }, + { + "completion_length": 133.125, + "epoch": 0.32125, + "grad_norm": 0.7689949870109558, + "kl": 0.0632908046245575, + "learning_rate": 4.290754858561636e-06, + "loss": 0.0025, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1285 + }, + { + "completion_length": 163.75, + "epoch": 0.3215, + "grad_norm": 0.44197624921798706, + "kl": 0.06356453895568848, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1286 + }, + { + "completion_length": 109.75, + "epoch": 0.32175, + "grad_norm": 0.8185544610023499, + "kl": 0.07203707098960876, + "learning_rate": 4.287707447379169e-06, + "loss": 0.0029, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1287 + }, + { + "completion_length": 157.75, + "epoch": 0.322, + "grad_norm": 0.8443469405174255, + "kl": 0.06975241750478745, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0028, + "reward": 1.337499976158142, + "reward_std": 0.41811659932136536, + "rewards/_accuracy_reward": 0.3374999761581421, + "rewards/_format_reward": 1.0, + "step": 1288 + }, + { + "completion_length": 99.25, + "epoch": 0.32225, + "grad_norm": 0.017100084573030472, + "kl": 0.061960458755493164, + "learning_rate": 4.284654590529784e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1289 + }, + { + "completion_length": 148.5, + "epoch": 0.3225, + "grad_norm": 0.7848110198974609, + "kl": 0.05555134639143944, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0022, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1290 + }, + { + "completion_length": 117.375, + "epoch": 0.32275, + "grad_norm": 0.7877644896507263, + "kl": 0.09985921531915665, + "learning_rate": 4.281596297313014e-06, + "loss": 0.004, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 1291 + }, + { + "completion_length": 154.25, + "epoch": 0.323, + "grad_norm": 0.030576931312680244, + "kl": 0.05328349769115448, + "learning_rate": 4.280065114977492e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1292 + }, + { + "completion_length": 120.5, + "epoch": 0.32325, + "grad_norm": 0.12656356394290924, + "kl": 0.08763141185045242, + "learning_rate": 4.278532577044949e-06, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1293 + }, + { + "completion_length": 171.25, + "epoch": 0.3235, + "grad_norm": 0.6466588377952576, + "kl": 0.06330376863479614, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1294 + }, + { + "completion_length": 114.25, + "epoch": 0.32375, + "grad_norm": 0.03600083664059639, + "kl": 0.07176019996404648, + "learning_rate": 4.275463439058214e-06, + "loss": 0.0029, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1295 + }, + { + "completion_length": 181.5, + "epoch": 0.324, + "grad_norm": 0.6595420837402344, + "kl": 0.0831797644495964, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0033, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 1296 + }, + { + "completion_length": 171.125, + "epoch": 0.32425, + "grad_norm": 0.037904031574726105, + "kl": 0.06669317185878754, + "learning_rate": 4.272388892701934e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1297 + }, + { + "completion_length": 173.0, + "epoch": 0.3245, + "grad_norm": 0.7312850952148438, + "kl": 0.05984557047486305, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0024, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.75, + "step": 1298 + }, + { + "completion_length": 161.125, + "epoch": 0.32475, + "grad_norm": 0.6081017255783081, + "kl": 0.04914931207895279, + "learning_rate": 4.269308947341711e-06, + "loss": 0.002, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 1299 + }, + { + "completion_length": 167.0, + "epoch": 0.325, + "grad_norm": 0.6635521650314331, + "kl": 0.06111575663089752, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0024, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1300 + }, + { + "completion_length": 176.5, + "epoch": 0.32525, + "grad_norm": 0.042982131242752075, + "kl": 0.06564896553754807, + "learning_rate": 4.266223612359593e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1301 + }, + { + "completion_length": 163.625, + "epoch": 0.3255, + "grad_norm": 0.638775110244751, + "kl": 0.057519882917404175, + "learning_rate": 4.264678926696703e-06, + "loss": 0.0023, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1302 + }, + { + "completion_length": 137.5, + "epoch": 0.32575, + "grad_norm": 0.7242026329040527, + "kl": 0.057923607528209686, + "learning_rate": 4.263132897154044e-06, + "loss": 0.0023, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1303 + }, + { + "completion_length": 119.0, + "epoch": 0.326, + "grad_norm": 0.5665600895881653, + "kl": 0.052210718393325806, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1304 + }, + { + "completion_length": 184.0, + "epoch": 0.32625, + "grad_norm": 0.6595292091369629, + "kl": 0.07032403349876404, + "learning_rate": 4.260036811139922e-06, + "loss": 0.0028, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1305 + }, + { + "completion_length": 185.625, + "epoch": 0.3265, + "grad_norm": 0.031764958053827286, + "kl": 0.05210375413298607, + "learning_rate": 4.25848675702626e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1306 + }, + { + "completion_length": 198.5, + "epoch": 0.32675, + "grad_norm": 0.5040040016174316, + "kl": 0.04720192775130272, + "learning_rate": 4.256935363748437e-06, + "loss": 0.0019, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.75, + "step": 1307 + }, + { + "completion_length": 216.25, + "epoch": 0.327, + "grad_norm": 0.43086105585098267, + "kl": 0.04231351986527443, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0017, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 1308 + }, + { + "completion_length": 133.375, + "epoch": 0.32725, + "grad_norm": 0.6797394752502441, + "kl": 0.04834214597940445, + "learning_rate": 4.25382856442714e-06, + "loss": 0.0019, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1309 + }, + { + "completion_length": 132.375, + "epoch": 0.3275, + "grad_norm": 0.6795439720153809, + "kl": 0.0568021684885025, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.0023, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1310 + }, + { + "completion_length": 168.625, + "epoch": 0.32775, + "grad_norm": 0.6751994490623474, + "kl": 0.0991804301738739, + "learning_rate": 4.250716422639878e-06, + "loss": 0.004, + "reward": 1.212499976158142, + "reward_std": 0.3324691653251648, + "rewards/_accuracy_reward": 0.3374999761581421, + "rewards/_format_reward": 0.875, + "step": 1311 + }, + { + "completion_length": 186.625, + "epoch": 0.328, + "grad_norm": 0.7776236534118652, + "kl": 0.042616959661245346, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0017, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1312 + }, + { + "completion_length": 133.25, + "epoch": 0.32825, + "grad_norm": 0.02199604921042919, + "kl": 0.05136921629309654, + "learning_rate": 4.247598947866775e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1313 + }, + { + "completion_length": 164.875, + "epoch": 0.3285, + "grad_norm": 0.5183839797973633, + "kl": 0.06551958620548248, + "learning_rate": 4.246038213577516e-06, + "loss": 0.0026, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1314 + }, + { + "completion_length": 170.875, + "epoch": 0.32875, + "grad_norm": 0.5866003632545471, + "kl": 0.05409818887710571, + "learning_rate": 4.244476149604201e-06, + "loss": 0.0022, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1315 + }, + { + "completion_length": 139.625, + "epoch": 0.329, + "grad_norm": 0.7940442562103271, + "kl": 0.05304113030433655, + "learning_rate": 4.242912757136412e-06, + "loss": 0.0021, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1316 + }, + { + "completion_length": 205.0, + "epoch": 0.32925, + "grad_norm": 0.5274433493614197, + "kl": 0.06083288788795471, + "learning_rate": 4.24134803736474e-06, + "loss": 0.0024, + "reward": 1.2625000476837158, + "reward_std": 0.8826704621315002, + "rewards/_accuracy_reward": 0.512499988079071, + "rewards/_format_reward": 0.75, + "step": 1317 + }, + { + "completion_length": 181.125, + "epoch": 0.3295, + "grad_norm": 0.5549687147140503, + "kl": 0.06591016054153442, + "learning_rate": 4.239781991480786e-06, + "loss": 0.0026, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1318 + }, + { + "completion_length": 132.75, + "epoch": 0.32975, + "grad_norm": 0.5778529047966003, + "kl": 0.06366316229104996, + "learning_rate": 4.238214620677164e-06, + "loss": 0.0025, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1319 + }, + { + "completion_length": 136.375, + "epoch": 0.33, + "grad_norm": 0.024910366162657738, + "kl": 0.04503370448946953, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1320 + }, + { + "completion_length": 173.625, + "epoch": 0.33025, + "grad_norm": 0.5607073307037354, + "kl": 0.05769224464893341, + "learning_rate": 4.235075909086405e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1321 + }, + { + "completion_length": 100.625, + "epoch": 0.3305, + "grad_norm": 1.6146537065505981, + "kl": 0.4625369906425476, + "learning_rate": 4.233504570689533e-06, + "loss": 0.0185, + "reward": 1.693750023841858, + "reward_std": 0.42714792490005493, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 1322 + }, + { + "completion_length": 171.125, + "epoch": 0.33075, + "grad_norm": 0.632230818271637, + "kl": 0.04798278212547302, + "learning_rate": 4.231931912153521e-06, + "loss": 0.0019, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 1323 + }, + { + "completion_length": 138.125, + "epoch": 0.331, + "grad_norm": 0.022344160825014114, + "kl": 0.04536424204707146, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1324 + }, + { + "completion_length": 169.875, + "epoch": 0.33125, + "grad_norm": 0.5961106419563293, + "kl": 0.06459388136863708, + "learning_rate": 4.228782639455674e-06, + "loss": 0.0026, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1325 + }, + { + "completion_length": 145.875, + "epoch": 0.3315, + "grad_norm": 0.5319207906723022, + "kl": 0.04201117902994156, + "learning_rate": 4.227206027692146e-06, + "loss": 0.0017, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1326 + }, + { + "completion_length": 178.5, + "epoch": 0.33175, + "grad_norm": 0.607018768787384, + "kl": 0.04852335527539253, + "learning_rate": 4.225628100586093e-06, + "loss": 0.0019, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1327 + }, + { + "completion_length": 133.125, + "epoch": 0.332, + "grad_norm": 0.019281940534710884, + "kl": 0.0670793280005455, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1328 + }, + { + "completion_length": 133.625, + "epoch": 0.33225, + "grad_norm": 0.7307944297790527, + "kl": 0.06218728423118591, + "learning_rate": 4.222468305154052e-06, + "loss": 0.0025, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1329 + }, + { + "completion_length": 157.625, + "epoch": 0.3325, + "grad_norm": 0.48206326365470886, + "kl": 0.04004458710551262, + "learning_rate": 4.220886439234385e-06, + "loss": 0.0016, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1330 + }, + { + "completion_length": 160.75, + "epoch": 0.33275, + "grad_norm": 0.036685239523649216, + "kl": 0.06175795570015907, + "learning_rate": 4.219303262784834e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1331 + }, + { + "completion_length": 153.25, + "epoch": 0.333, + "grad_norm": 0.6564381718635559, + "kl": 0.04746977239847183, + "learning_rate": 4.217718777011058e-06, + "loss": 0.0019, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1332 + }, + { + "completion_length": 139.5, + "epoch": 0.33325, + "grad_norm": 0.5996315479278564, + "kl": 0.04581526294350624, + "learning_rate": 4.2161329831197095e-06, + "loss": 0.0018, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1333 + }, + { + "completion_length": 170.25, + "epoch": 0.3335, + "grad_norm": 0.6188631057739258, + "kl": 0.05417291074991226, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.0022, + "reward": 1.274999976158142, + "reward_std": 0.6974443197250366, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.875, + "step": 1334 + }, + { + "completion_length": 148.0, + "epoch": 0.33375, + "grad_norm": 0.023063072934746742, + "kl": 0.04357193037867546, + "learning_rate": 4.212957475815898e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1335 + }, + { + "completion_length": 110.375, + "epoch": 0.334, + "grad_norm": 0.8400101661682129, + "kl": 0.05062510818243027, + "learning_rate": 4.211367764821722e-06, + "loss": 0.002, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1336 + }, + { + "completion_length": 174.0, + "epoch": 0.33425, + "grad_norm": 0.6004985570907593, + "kl": 0.03446627035737038, + "learning_rate": 4.209776750546547e-06, + "loss": 0.0014, + "reward": 1.78125, + "reward_std": 0.6187184453010559, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 1337 + }, + { + "completion_length": 151.125, + "epoch": 0.3345, + "grad_norm": 0.5468786954879761, + "kl": 0.06371209770441055, + "learning_rate": 4.208184434201999e-06, + "loss": 0.0025, + "reward": 1.1687499284744263, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.16875000298023224, + "rewards/_format_reward": 1.0, + "step": 1338 + }, + { + "completion_length": 172.375, + "epoch": 0.33475, + "grad_norm": 0.024628562852740288, + "kl": 0.04282301664352417, + "learning_rate": 4.206590817000695e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1339 + }, + { + "completion_length": 211.25, + "epoch": 0.335, + "grad_norm": 0.47119390964508057, + "kl": 0.06950601935386658, + "learning_rate": 4.204995900156247e-06, + "loss": 0.0028, + "reward": 1.6262500286102295, + "reward_std": 0.7428312301635742, + "rewards/_accuracy_reward": 0.7512500286102295, + "rewards/_format_reward": 0.875, + "step": 1340 + }, + { + "completion_length": 150.25, + "epoch": 0.33525, + "grad_norm": 0.08035016059875488, + "kl": 0.0573977455496788, + "learning_rate": 4.20339968488325e-06, + "loss": 0.0023, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 1341 + }, + { + "completion_length": 160.25, + "epoch": 0.3355, + "grad_norm": 0.7187206149101257, + "kl": 0.07497703284025192, + "learning_rate": 4.201802172397295e-06, + "loss": 0.003, + "reward": 1.1512500047683716, + "reward_std": 0.6355074048042297, + "rewards/_accuracy_reward": 0.2762500047683716, + "rewards/_format_reward": 0.875, + "step": 1342 + }, + { + "completion_length": 173.75, + "epoch": 0.33575, + "grad_norm": 0.5304349660873413, + "kl": 0.08109744638204575, + "learning_rate": 4.2002033639149545e-06, + "loss": 0.0032, + "reward": 1.537500023841858, + "reward_std": 0.7322909235954285, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.875, + "step": 1343 + }, + { + "completion_length": 176.875, + "epoch": 0.336, + "grad_norm": 0.6054671406745911, + "kl": 0.05511131510138512, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0022, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1344 + }, + { + "completion_length": 162.75, + "epoch": 0.33625, + "grad_norm": 0.025135153904557228, + "kl": 0.05139942467212677, + "learning_rate": 4.197001863832355e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1345 + }, + { + "completion_length": 137.625, + "epoch": 0.3365, + "grad_norm": 0.6007516980171204, + "kl": 0.05004937946796417, + "learning_rate": 4.195399174670177e-06, + "loss": 0.002, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1346 + }, + { + "completion_length": 197.875, + "epoch": 0.33675, + "grad_norm": 0.5018278956413269, + "kl": 0.05320580676198006, + "learning_rate": 4.193795194387776e-06, + "loss": 0.0021, + "reward": 1.1437499523162842, + "reward_std": 0.8317097425460815, + "rewards/_accuracy_reward": 0.39375001192092896, + "rewards/_format_reward": 0.75, + "step": 1347 + }, + { + "completion_length": 96.375, + "epoch": 0.337, + "grad_norm": 0.7763251662254333, + "kl": 0.13074029982089996, + "learning_rate": 4.192189924206652e-06, + "loss": 0.0052, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 1348 + }, + { + "completion_length": 173.75, + "epoch": 0.33725, + "grad_norm": 0.021658629179000854, + "kl": 0.04604887589812279, + "learning_rate": 4.190583365349289e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1349 + }, + { + "completion_length": 183.25, + "epoch": 0.3375, + "grad_norm": 0.5501531958580017, + "kl": 0.05152320861816406, + "learning_rate": 4.188975519039151e-06, + "loss": 0.0021, + "reward": 1.274999976158142, + "reward_std": 0.6974443197250366, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.875, + "step": 1350 + }, + { + "completion_length": 216.5, + "epoch": 0.33775, + "grad_norm": 0.547321081161499, + "kl": 0.05665482208132744, + "learning_rate": 4.1873663865006835e-06, + "loss": 0.0023, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1351 + }, + { + "completion_length": 162.75, + "epoch": 0.338, + "grad_norm": 0.6683735847473145, + "kl": 0.043146222829818726, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0017, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1352 + }, + { + "completion_length": 170.375, + "epoch": 0.33825, + "grad_norm": 0.6152597665786743, + "kl": 0.05195571482181549, + "learning_rate": 4.184144267641433e-06, + "loss": 0.0021, + "reward": 0.8124999403953552, + "reward_std": 0.5062113404273987, + "rewards/_accuracy_reward": 0.0625, + "rewards/_format_reward": 0.75, + "step": 1353 + }, + { + "completion_length": 150.125, + "epoch": 0.3385, + "grad_norm": 0.7096376419067383, + "kl": 0.054735492914915085, + "learning_rate": 4.182531283774434e-06, + "loss": 0.0022, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1354 + }, + { + "completion_length": 138.0, + "epoch": 0.33875, + "grad_norm": 0.024319298565387726, + "kl": 0.051283035427331924, + "learning_rate": 4.18091701858667e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1355 + }, + { + "completion_length": 108.25, + "epoch": 0.339, + "grad_norm": 0.8185098171234131, + "kl": 0.06744442135095596, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0027, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1356 + }, + { + "completion_length": 154.25, + "epoch": 0.33925, + "grad_norm": 0.5948598384857178, + "kl": 0.059390176087617874, + "learning_rate": 4.177684649167158e-06, + "loss": 0.0024, + "reward": 1.7574999332427979, + "reward_std": 0.4491499960422516, + "rewards/_accuracy_reward": 0.7574999928474426, + "rewards/_format_reward": 1.0, + "step": 1357 + }, + { + "completion_length": 178.125, + "epoch": 0.3395, + "grad_norm": 0.023550184443593025, + "kl": 0.04499709606170654, + "learning_rate": 4.176066547396998e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1358 + }, + { + "completion_length": 153.375, + "epoch": 0.33975, + "grad_norm": 0.027199365198612213, + "kl": 0.05625481531023979, + "learning_rate": 4.174447169229252e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1359 + }, + { + "completion_length": 137.875, + "epoch": 0.34, + "grad_norm": 0.019031843170523643, + "kl": 0.051029808819293976, + "learning_rate": 4.172826515897146e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1360 + }, + { + "completion_length": 158.75, + "epoch": 0.34025, + "grad_norm": 0.03614223375916481, + "kl": 0.06285353004932404, + "learning_rate": 4.171204588634878e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1361 + }, + { + "completion_length": 122.625, + "epoch": 0.3405, + "grad_norm": 0.5458539128303528, + "kl": 0.06035559996962547, + "learning_rate": 4.169581388677617e-06, + "loss": 0.0024, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1362 + }, + { + "completion_length": 120.125, + "epoch": 0.34075, + "grad_norm": 0.029562877491116524, + "kl": 0.0810147076845169, + "learning_rate": 4.1679569172614994e-06, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1363 + }, + { + "completion_length": 188.125, + "epoch": 0.341, + "grad_norm": 0.6008427143096924, + "kl": 0.06007464975118637, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0024, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1364 + }, + { + "completion_length": 129.875, + "epoch": 0.34125, + "grad_norm": 0.7642394304275513, + "kl": 0.07598597556352615, + "learning_rate": 4.164704165002086e-06, + "loss": 0.003, + "reward": 1.4075000286102295, + "reward_std": 0.3749571442604065, + "rewards/_accuracy_reward": 0.4074999988079071, + "rewards/_format_reward": 1.0, + "step": 1365 + }, + { + "completion_length": 156.5, + "epoch": 0.3415, + "grad_norm": 0.024682415649294853, + "kl": 0.049706555902957916, + "learning_rate": 4.163075886635902e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1366 + }, + { + "completion_length": 106.875, + "epoch": 0.34175, + "grad_norm": 0.024045802652835846, + "kl": 0.04866664111614227, + "learning_rate": 4.161446341765085e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1367 + }, + { + "completion_length": 156.25, + "epoch": 0.342, + "grad_norm": 0.6025788187980652, + "kl": 0.051358725875616074, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0021, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1368 + }, + { + "completion_length": 155.625, + "epoch": 0.34225, + "grad_norm": 0.035298582166433334, + "kl": 0.05683750659227371, + "learning_rate": 4.158183457474392e-06, + "loss": 0.0023, + "reward": 1.0499999523162842, + "reward_std": 0.0, + "rewards/_accuracy_reward": 0.05000000074505806, + "rewards/_format_reward": 1.0, + "step": 1369 + }, + { + "completion_length": 145.5, + "epoch": 0.3425, + "grad_norm": 0.6779209971427917, + "kl": 0.08619740605354309, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.0034, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1370 + }, + { + "completion_length": 147.25, + "epoch": 0.34275, + "grad_norm": 0.055118631571531296, + "kl": 0.07643330842256546, + "learning_rate": 4.154915522069318e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1371 + }, + { + "completion_length": 158.0, + "epoch": 0.343, + "grad_norm": 0.03090520389378071, + "kl": 0.042203597724437714, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1372 + }, + { + "completion_length": 151.125, + "epoch": 0.34325, + "grad_norm": 0.02153955027461052, + "kl": 0.04990649223327637, + "learning_rate": 4.15164254550456e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1373 + }, + { + "completion_length": 172.25, + "epoch": 0.3435, + "grad_norm": 0.7357903718948364, + "kl": 0.07494698464870453, + "learning_rate": 4.150004169902343e-06, + "loss": 0.003, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1374 + }, + { + "completion_length": 138.75, + "epoch": 0.34375, + "grad_norm": 0.6570234894752502, + "kl": 0.11442865431308746, + "learning_rate": 4.1483645377501726e-06, + "loss": 0.0046, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 1375 + }, + { + "completion_length": 145.75, + "epoch": 0.344, + "grad_norm": 0.03160668909549713, + "kl": 0.057240959256887436, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1376 + }, + { + "completion_length": 139.5, + "epoch": 0.34425, + "grad_norm": 0.6059741973876953, + "kl": 0.05288849398493767, + "learning_rate": 4.145081508791536e-06, + "loss": 0.0021, + "reward": 1.7575000524520874, + "reward_std": 0.449150025844574, + "rewards/_accuracy_reward": 0.7575000524520874, + "rewards/_format_reward": 1.0, + "step": 1377 + }, + { + "completion_length": 166.0, + "epoch": 0.3445, + "grad_norm": 0.6067208647727966, + "kl": 0.056651707738637924, + "learning_rate": 4.14343811448524e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1378 + }, + { + "completion_length": 162.0, + "epoch": 0.34475, + "grad_norm": 0.5100969672203064, + "kl": 0.06123419851064682, + "learning_rate": 4.141793468629327e-06, + "loss": 0.0024, + "reward": 1.181249976158142, + "reward_std": 0.6335486769676208, + "rewards/_accuracy_reward": 0.3062499761581421, + "rewards/_format_reward": 0.875, + "step": 1379 + }, + { + "completion_length": 117.125, + "epoch": 0.345, + "grad_norm": 0.5899714231491089, + "kl": 0.07519141584634781, + "learning_rate": 4.140147572476269e-06, + "loss": 0.003, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1380 + }, + { + "completion_length": 179.625, + "epoch": 0.34525, + "grad_norm": 0.471913605928421, + "kl": 0.04343201965093613, + "learning_rate": 4.138500427279485e-06, + "loss": 0.0017, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1381 + }, + { + "completion_length": 120.875, + "epoch": 0.3455, + "grad_norm": 0.021074136719107628, + "kl": 0.058206070214509964, + "learning_rate": 4.136852034293349e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1382 + }, + { + "completion_length": 163.0, + "epoch": 0.34575, + "grad_norm": 0.5675240159034729, + "kl": 0.05850347504019737, + "learning_rate": 4.135202394773186e-06, + "loss": 0.0023, + "reward": 1.0374999046325684, + "reward_std": 0.5350233912467957, + "rewards/_accuracy_reward": 0.16249999403953552, + "rewards/_format_reward": 0.875, + "step": 1383 + }, + { + "completion_length": 139.25, + "epoch": 0.346, + "grad_norm": 0.5779051184654236, + "kl": 0.029702937230467796, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0012, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 1384 + }, + { + "completion_length": 109.125, + "epoch": 0.34625, + "grad_norm": 0.026169802993535995, + "kl": 0.051198095083236694, + "learning_rate": 4.1318993811568065e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1385 + }, + { + "completion_length": 98.5, + "epoch": 0.3465, + "grad_norm": 0.02033095993101597, + "kl": 0.06018479913473129, + "learning_rate": 4.130246009575981e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1386 + }, + { + "completion_length": 134.75, + "epoch": 0.34675, + "grad_norm": 0.7308709025382996, + "kl": 0.06635252386331558, + "learning_rate": 4.128591396491901e-06, + "loss": 0.0027, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1387 + }, + { + "completion_length": 181.875, + "epoch": 0.347, + "grad_norm": 0.5132285356521606, + "kl": 0.03996492922306061, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0016, + "reward": 1.1437499523162842, + "reward_std": 0.8317097425460815, + "rewards/_accuracy_reward": 0.39374998211860657, + "rewards/_format_reward": 0.75, + "step": 1388 + }, + { + "completion_length": 199.75, + "epoch": 0.34725, + "grad_norm": 0.6100365519523621, + "kl": 0.058336708694696426, + "learning_rate": 4.125278450855165e-06, + "loss": 0.0023, + "reward": 1.6262500286102295, + "reward_std": 0.7428312301635742, + "rewards/_accuracy_reward": 0.7512500286102295, + "rewards/_format_reward": 0.875, + "step": 1389 + }, + { + "completion_length": 157.0, + "epoch": 0.3475, + "grad_norm": 0.668353259563446, + "kl": 0.056433651596307755, + "learning_rate": 4.123620120825459e-06, + "loss": 0.0023, + "reward": 1.6637499332427979, + "reward_std": 0.4691310524940491, + "rewards/_accuracy_reward": 0.6637500524520874, + "rewards/_format_reward": 1.0, + "step": 1390 + }, + { + "completion_length": 138.75, + "epoch": 0.34775, + "grad_norm": 0.022827059030532837, + "kl": 0.043130144476890564, + "learning_rate": 4.1219605543384036e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1391 + }, + { + "completion_length": 169.625, + "epoch": 0.348, + "grad_norm": 0.615968644618988, + "kl": 0.045771509408950806, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0018, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 1392 + }, + { + "completion_length": 185.75, + "epoch": 0.34825, + "grad_norm": 0.035993240773677826, + "kl": 0.0693710520863533, + "learning_rate": 4.1186377170485055e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1393 + }, + { + "completion_length": 187.0, + "epoch": 0.3485, + "grad_norm": 0.5210174918174744, + "kl": 0.05211419612169266, + "learning_rate": 4.11697444877615e-06, + "loss": 0.0021, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1394 + }, + { + "completion_length": 156.875, + "epoch": 0.34875, + "grad_norm": 0.7188504934310913, + "kl": 0.061935752630233765, + "learning_rate": 4.11530994910741e-06, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1395 + }, + { + "completion_length": 125.5, + "epoch": 0.349, + "grad_norm": 0.8065851330757141, + "kl": 0.08348346501588821, + "learning_rate": 4.113644219309877e-06, + "loss": 0.0033, + "reward": 1.600000023841858, + "reward_std": 0.43260011076927185, + "rewards/_accuracy_reward": 0.6000000238418579, + "rewards/_format_reward": 1.0, + "step": 1396 + }, + { + "completion_length": 168.875, + "epoch": 0.34925, + "grad_norm": 0.6905811429023743, + "kl": 0.05310038477182388, + "learning_rate": 4.1119772606520755e-06, + "loss": 0.0021, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1397 + }, + { + "completion_length": 166.875, + "epoch": 0.3495, + "grad_norm": 0.5804812908172607, + "kl": 0.05809628963470459, + "learning_rate": 4.110309074403467e-06, + "loss": 0.0023, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 1398 + }, + { + "completion_length": 125.75, + "epoch": 0.34975, + "grad_norm": 0.813575029373169, + "kl": 0.034708425402641296, + "learning_rate": 4.1086396618344474e-06, + "loss": 0.0014, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 1399 + }, + { + "completion_length": 102.125, + "epoch": 0.35, + "grad_norm": 0.6382107734680176, + "kl": 0.07151403278112411, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0029, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1400 + }, + { + "completion_length": 183.875, + "epoch": 0.35025, + "grad_norm": 0.5745235681533813, + "kl": 0.06930546462535858, + "learning_rate": 4.105297162821433e-06, + "loss": 0.0028, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 1401 + }, + { + "completion_length": 124.0, + "epoch": 0.3505, + "grad_norm": 0.7871240377426147, + "kl": 0.05033106729388237, + "learning_rate": 4.103624078922895e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1402 + }, + { + "completion_length": 108.25, + "epoch": 0.35075, + "grad_norm": 0.8062891364097595, + "kl": 0.0857778936624527, + "learning_rate": 4.101949773794862e-06, + "loss": 0.0034, + "reward": 1.78125, + "reward_std": 0.6187184453010559, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 0.875, + "step": 1403 + }, + { + "completion_length": 213.875, + "epoch": 0.351, + "grad_norm": 0.6212908029556274, + "kl": 0.06841997802257538, + "learning_rate": 4.1002742487123896e-06, + "loss": 0.0027, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/_accuracy_reward": 0.625, + "rewards/_format_reward": 0.625, + "step": 1404 + }, + { + "completion_length": 161.5, + "epoch": 0.35125, + "grad_norm": 0.7645436525344849, + "kl": 0.07526696473360062, + "learning_rate": 4.098597504951462e-06, + "loss": 0.003, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1405 + }, + { + "completion_length": 195.75, + "epoch": 0.3515, + "grad_norm": 0.8574343323707581, + "kl": 0.08652313798666, + "learning_rate": 4.096919543788995e-06, + "loss": 0.0035, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 1406 + }, + { + "completion_length": 135.375, + "epoch": 0.35175, + "grad_norm": 0.03515785187482834, + "kl": 0.06902702897787094, + "learning_rate": 4.095240366502827e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1407 + }, + { + "completion_length": 163.5, + "epoch": 0.352, + "grad_norm": 0.5809192061424255, + "kl": 0.039451714605093, + "learning_rate": 4.093559974371725e-06, + "loss": 0.0016, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1408 + }, + { + "completion_length": 131.25, + "epoch": 0.35225, + "grad_norm": 1.0675450563430786, + "kl": 0.037023499608039856, + "learning_rate": 4.09187836867538e-06, + "loss": 0.0015, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1409 + }, + { + "completion_length": 126.5, + "epoch": 0.3525, + "grad_norm": 0.03956000879406929, + "kl": 0.05268242955207825, + "learning_rate": 4.09019555069441e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1410 + }, + { + "completion_length": 158.5, + "epoch": 0.35275, + "grad_norm": 1.107899785041809, + "kl": 0.06893620640039444, + "learning_rate": 4.088511521710353e-06, + "loss": 0.0028, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.7625000476837158, + "rewards/_format_reward": 1.0, + "step": 1411 + }, + { + "completion_length": 83.625, + "epoch": 0.353, + "grad_norm": 0.020886188372969627, + "kl": 0.08969815075397491, + "learning_rate": 4.086826283005669e-06, + "loss": 0.0036, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1412 + }, + { + "completion_length": 149.125, + "epoch": 0.35325, + "grad_norm": 0.8237113952636719, + "kl": 0.060487691313028336, + "learning_rate": 4.085139835863743e-06, + "loss": 0.0024, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1413 + }, + { + "completion_length": 172.375, + "epoch": 0.3535, + "grad_norm": 0.7035109996795654, + "kl": 0.058261074125766754, + "learning_rate": 4.083452181568876e-06, + "loss": 0.0023, + "reward": 1.3125, + "reward_std": 0.4299086928367615, + "rewards/_accuracy_reward": 0.3125, + "rewards/_format_reward": 1.0, + "step": 1414 + }, + { + "completion_length": 164.5, + "epoch": 0.35375, + "grad_norm": 0.5613075494766235, + "kl": 0.08956658095121384, + "learning_rate": 4.081763321406291e-06, + "loss": 0.0036, + "reward": 1.2687499523162842, + "reward_std": 0.699968159198761, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.75, + "step": 1415 + }, + { + "completion_length": 180.25, + "epoch": 0.354, + "grad_norm": 1.1197295188903809, + "kl": 0.09375526010990143, + "learning_rate": 4.080073256662128e-06, + "loss": 0.0038, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1416 + }, + { + "completion_length": 128.875, + "epoch": 0.35425, + "grad_norm": 0.031155169010162354, + "kl": 0.0490286611020565, + "learning_rate": 4.078381988623445e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1417 + }, + { + "completion_length": 174.625, + "epoch": 0.3545, + "grad_norm": 0.612561047077179, + "kl": 0.06778530031442642, + "learning_rate": 4.076689518578217e-06, + "loss": 0.0027, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.875, + "step": 1418 + }, + { + "completion_length": 87.25, + "epoch": 0.35475, + "grad_norm": 0.05403704568743706, + "kl": 0.04753972589969635, + "learning_rate": 4.074995847815331e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1419 + }, + { + "completion_length": 144.375, + "epoch": 0.355, + "grad_norm": 0.03159556910395622, + "kl": 0.05760689452290535, + "learning_rate": 4.073300977624594e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1420 + }, + { + "completion_length": 162.625, + "epoch": 0.35525, + "grad_norm": 0.7645871043205261, + "kl": 0.0827740728855133, + "learning_rate": 4.0716049092967224e-06, + "loss": 0.0033, + "reward": 1.3125, + "reward_std": 0.4299086630344391, + "rewards/_accuracy_reward": 0.3124999701976776, + "rewards/_format_reward": 1.0, + "step": 1421 + }, + { + "completion_length": 120.75, + "epoch": 0.3555, + "grad_norm": 0.05223441123962402, + "kl": 0.05872947722673416, + "learning_rate": 4.069907644123346e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1422 + }, + { + "completion_length": 133.625, + "epoch": 0.35575, + "grad_norm": 0.7665608525276184, + "kl": 0.04824737831950188, + "learning_rate": 4.068209183397005e-06, + "loss": 0.0019, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 1.0, + "step": 1423 + }, + { + "completion_length": 93.625, + "epoch": 0.356, + "grad_norm": 0.898429274559021, + "kl": 0.10655061900615692, + "learning_rate": 4.066509528411151e-06, + "loss": 0.0043, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1424 + }, + { + "completion_length": 179.625, + "epoch": 0.35625, + "grad_norm": 0.7845519781112671, + "kl": 0.0487191379070282, + "learning_rate": 4.064808680460149e-06, + "loss": 0.0019, + "reward": 1.4562499523162842, + "reward_std": 0.45781898498535156, + "rewards/_accuracy_reward": 0.45624998211860657, + "rewards/_format_reward": 1.0, + "step": 1425 + }, + { + "completion_length": 171.625, + "epoch": 0.3565, + "grad_norm": 0.8606228828430176, + "kl": 0.05676417797803879, + "learning_rate": 4.063106640839264e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1426 + }, + { + "completion_length": 164.25, + "epoch": 0.35675, + "grad_norm": 0.6395389437675476, + "kl": 0.07135939598083496, + "learning_rate": 4.061403410844674e-06, + "loss": 0.0029, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1427 + }, + { + "completion_length": 193.0, + "epoch": 0.357, + "grad_norm": 0.5392956137657166, + "kl": 0.048543695360422134, + "learning_rate": 4.059698991773466e-06, + "loss": 0.0019, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1428 + }, + { + "completion_length": 176.625, + "epoch": 0.35725, + "grad_norm": 0.7690248489379883, + "kl": 0.0833929181098938, + "learning_rate": 4.057993384923626e-06, + "loss": 0.0033, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1429 + }, + { + "completion_length": 133.75, + "epoch": 0.3575, + "grad_norm": 0.643065869808197, + "kl": 0.059519246220588684, + "learning_rate": 4.056286591594049e-06, + "loss": 0.0024, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 1430 + }, + { + "completion_length": 118.875, + "epoch": 0.35775, + "grad_norm": 0.72835773229599, + "kl": 0.03510938957333565, + "learning_rate": 4.0545786130845325e-06, + "loss": 0.0014, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 1431 + }, + { + "completion_length": 85.375, + "epoch": 0.358, + "grad_norm": 0.031654562801122665, + "kl": 0.03833214193582535, + "learning_rate": 4.052869450695776e-06, + "loss": 0.0015, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1432 + }, + { + "completion_length": 117.0, + "epoch": 0.35825, + "grad_norm": 0.052805446088314056, + "kl": 0.07381996512413025, + "learning_rate": 4.051159105729382e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1433 + }, + { + "completion_length": 143.875, + "epoch": 0.3585, + "grad_norm": 0.6013376116752625, + "kl": 0.05232910066843033, + "learning_rate": 4.049447579487851e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1434 + }, + { + "completion_length": 176.5, + "epoch": 0.35875, + "grad_norm": 0.03941601887345314, + "kl": 0.06202106177806854, + "learning_rate": 4.047734873274586e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1435 + }, + { + "completion_length": 221.125, + "epoch": 0.359, + "grad_norm": 0.7312876582145691, + "kl": 0.07595561444759369, + "learning_rate": 4.046020988393886e-06, + "loss": 0.003, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.2812499701976776, + "rewards/_format_reward": 0.875, + "step": 1436 + }, + { + "completion_length": 173.5, + "epoch": 0.35925, + "grad_norm": 0.7492666840553284, + "kl": 0.0508357509970665, + "learning_rate": 4.04430592615095e-06, + "loss": 0.002, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1437 + }, + { + "completion_length": 89.875, + "epoch": 0.3595, + "grad_norm": 1.046403169631958, + "kl": 0.052254389971494675, + "learning_rate": 4.0425896878518725e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1438 + }, + { + "completion_length": 149.375, + "epoch": 0.35975, + "grad_norm": 0.6857521533966064, + "kl": 0.06438186764717102, + "learning_rate": 4.0408722748036426e-06, + "loss": 0.0026, + "reward": 1.631250023841858, + "reward_std": 0.7382108569145203, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 1439 + }, + { + "completion_length": 160.25, + "epoch": 0.36, + "grad_norm": 0.8094133138656616, + "kl": 0.07312402129173279, + "learning_rate": 4.039153688314146e-06, + "loss": 0.0029, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1440 + }, + { + "completion_length": 129.625, + "epoch": 0.36025, + "grad_norm": 0.045799799263477325, + "kl": 0.053429555147886276, + "learning_rate": 4.037433929692161e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1441 + }, + { + "completion_length": 164.875, + "epoch": 0.3605, + "grad_norm": 0.8309241533279419, + "kl": 0.08402340859174728, + "learning_rate": 4.035713000247358e-06, + "loss": 0.0034, + "reward": 1.756250023841858, + "reward_std": 0.45153507590293884, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 1442 + }, + { + "completion_length": 129.625, + "epoch": 0.36075, + "grad_norm": 0.04994899779558182, + "kl": 0.08733442425727844, + "learning_rate": 4.033990901290301e-06, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1443 + }, + { + "completion_length": 160.0, + "epoch": 0.361, + "grad_norm": 0.78934246301651, + "kl": 0.07535295933485031, + "learning_rate": 4.032267634132442e-06, + "loss": 0.003, + "reward": 1.631250023841858, + "reward_std": 0.7382108569145203, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 1444 + }, + { + "completion_length": 167.625, + "epoch": 0.36125, + "grad_norm": 0.8826912045478821, + "kl": 0.05429309234023094, + "learning_rate": 4.0305432000861236e-06, + "loss": 0.0022, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1445 + }, + { + "completion_length": 174.25, + "epoch": 0.3615, + "grad_norm": 1.0357098579406738, + "kl": 0.0778610110282898, + "learning_rate": 4.028817600464579e-06, + "loss": 0.0031, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1446 + }, + { + "completion_length": 117.25, + "epoch": 0.36175, + "grad_norm": 0.7004038691520691, + "kl": 0.046138960868120193, + "learning_rate": 4.027090836581925e-06, + "loss": 0.0018, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1447 + }, + { + "completion_length": 145.125, + "epoch": 0.362, + "grad_norm": 0.6335418224334717, + "kl": 0.05013753101229668, + "learning_rate": 4.02536290975317e-06, + "loss": 0.002, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1448 + }, + { + "completion_length": 169.5, + "epoch": 0.36225, + "grad_norm": 1.1653978824615479, + "kl": 0.04540247470140457, + "learning_rate": 4.023633821294203e-06, + "loss": 0.0018, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 1.0, + "step": 1449 + }, + { + "completion_length": 132.375, + "epoch": 0.3625, + "grad_norm": 0.023695236071944237, + "kl": 0.04884612187743187, + "learning_rate": 4.021903572521802e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1450 + }, + { + "completion_length": 147.5, + "epoch": 0.36275, + "grad_norm": 2.0923967361450195, + "kl": 0.08431154489517212, + "learning_rate": 4.020172164753626e-06, + "loss": 0.0034, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1451 + }, + { + "completion_length": 191.375, + "epoch": 0.363, + "grad_norm": 0.975801408290863, + "kl": 0.09080733358860016, + "learning_rate": 4.018439599308217e-06, + "loss": 0.0036, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1452 + }, + { + "completion_length": 124.5, + "epoch": 0.36325, + "grad_norm": 0.7811964154243469, + "kl": 0.06944286823272705, + "learning_rate": 4.016705877504999e-06, + "loss": 0.0028, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1453 + }, + { + "completion_length": 187.375, + "epoch": 0.3635, + "grad_norm": 0.9031335711479187, + "kl": 0.07759063690900803, + "learning_rate": 4.0149710006642775e-06, + "loss": 0.0031, + "reward": 1.2625000476837158, + "reward_std": 0.8826704621315002, + "rewards/_accuracy_reward": 0.512499988079071, + "rewards/_format_reward": 0.75, + "step": 1454 + }, + { + "completion_length": 128.375, + "epoch": 0.36375, + "grad_norm": 1.2263360023498535, + "kl": 0.0753258615732193, + "learning_rate": 4.013234970107236e-06, + "loss": 0.003, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1455 + }, + { + "completion_length": 106.0, + "epoch": 0.364, + "grad_norm": 0.08058945089578629, + "kl": 0.07110590487718582, + "learning_rate": 4.011497787155938e-06, + "loss": 0.0028, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1456 + }, + { + "completion_length": 172.875, + "epoch": 0.36425, + "grad_norm": 0.7150065898895264, + "kl": 0.09918969124555588, + "learning_rate": 4.009759453133322e-06, + "loss": 0.004, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.875, + "step": 1457 + }, + { + "completion_length": 172.5, + "epoch": 0.3645, + "grad_norm": 1.6500592231750488, + "kl": 0.11502061039209366, + "learning_rate": 4.008019969363206e-06, + "loss": 0.0046, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.875, + "step": 1458 + }, + { + "completion_length": 185.625, + "epoch": 0.36475, + "grad_norm": 0.7705068588256836, + "kl": 0.10530376434326172, + "learning_rate": 4.006279337170283e-06, + "loss": 0.0042, + "reward": 1.5325000286102295, + "reward_std": 0.7362210750579834, + "rewards/_accuracy_reward": 0.6575000286102295, + "rewards/_format_reward": 0.875, + "step": 1459 + }, + { + "completion_length": 140.0, + "epoch": 0.365, + "grad_norm": 2.920241117477417, + "kl": 0.1508193165063858, + "learning_rate": 4.0045375578801216e-06, + "loss": 0.006, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1460 + }, + { + "completion_length": 148.25, + "epoch": 0.36525, + "grad_norm": 2.5213630199432373, + "kl": 0.07739417254924774, + "learning_rate": 4.002794632819159e-06, + "loss": 0.0031, + "reward": 1.274999976158142, + "reward_std": 0.6974443197250366, + "rewards/_accuracy_reward": 0.3999999761581421, + "rewards/_format_reward": 0.875, + "step": 1461 + }, + { + "completion_length": 155.0, + "epoch": 0.3655, + "grad_norm": 2.9162180423736572, + "kl": 0.13509155809879303, + "learning_rate": 4.001050563314711e-06, + "loss": 0.0054, + "reward": 1.7825000286102295, + "reward_std": 0.40780770778656006, + "rewards/_accuracy_reward": 0.7825000286102295, + "rewards/_format_reward": 1.0, + "step": 1462 + }, + { + "completion_length": 136.25, + "epoch": 0.36575, + "grad_norm": 0.4022398591041565, + "kl": 0.10202545672655106, + "learning_rate": 3.999305350694961e-06, + "loss": 0.0041, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1463 + }, + { + "completion_length": 137.75, + "epoch": 0.366, + "grad_norm": 0.6191596388816833, + "kl": 0.11961022019386292, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0048, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1464 + }, + { + "completion_length": 188.375, + "epoch": 0.36625, + "grad_norm": 0.6211814284324646, + "kl": 0.09365051239728928, + "learning_rate": 3.995811501426648e-06, + "loss": 0.0037, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1465 + }, + { + "completion_length": 181.5, + "epoch": 0.3665, + "grad_norm": 0.8067208528518677, + "kl": 0.11959419399499893, + "learning_rate": 3.994062867438803e-06, + "loss": 0.0048, + "reward": 1.662500023841858, + "reward_std": 0.6604922413825989, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 0.875, + "step": 1466 + }, + { + "completion_length": 181.5, + "epoch": 0.36675, + "grad_norm": 0.5875232219696045, + "kl": 0.07762034982442856, + "learning_rate": 3.992313095657091e-06, + "loss": 0.0031, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1467 + }, + { + "completion_length": 160.25, + "epoch": 0.367, + "grad_norm": 0.6259009838104248, + "kl": 0.08514675498008728, + "learning_rate": 3.9905621874140396e-06, + "loss": 0.0034, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1468 + }, + { + "completion_length": 92.875, + "epoch": 0.36725, + "grad_norm": 0.023967457935214043, + "kl": 0.02864246629178524, + "learning_rate": 3.988810144043041e-06, + "loss": 0.0011, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1469 + }, + { + "completion_length": 175.875, + "epoch": 0.3675, + "grad_norm": 0.8310546278953552, + "kl": 0.0942971259355545, + "learning_rate": 3.987056966878354e-06, + "loss": 0.0038, + "reward": 1.6387500762939453, + "reward_std": 0.49872517585754395, + "rewards/_accuracy_reward": 0.6387499570846558, + "rewards/_format_reward": 1.0, + "step": 1470 + }, + { + "completion_length": 161.75, + "epoch": 0.36775, + "grad_norm": 0.03448256105184555, + "kl": 0.05922067165374756, + "learning_rate": 3.985302657255097e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1471 + }, + { + "completion_length": 141.75, + "epoch": 0.368, + "grad_norm": 0.02368452027440071, + "kl": 0.055194176733493805, + "learning_rate": 3.983547216509254e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1472 + }, + { + "completion_length": 120.625, + "epoch": 0.36825, + "grad_norm": 0.04141707718372345, + "kl": 0.04475046694278717, + "learning_rate": 3.98179064597767e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1473 + }, + { + "completion_length": 155.5, + "epoch": 0.3685, + "grad_norm": 0.587161123752594, + "kl": 0.056352969259023666, + "learning_rate": 3.9800329469980495e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1474 + }, + { + "completion_length": 162.125, + "epoch": 0.36875, + "grad_norm": 0.6867335438728333, + "kl": 0.06760282069444656, + "learning_rate": 3.978274120908957e-06, + "loss": 0.0027, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1475 + }, + { + "completion_length": 119.375, + "epoch": 0.369, + "grad_norm": 0.8442899584770203, + "kl": 0.07957350462675095, + "learning_rate": 3.976514169049814e-06, + "loss": 0.0032, + "reward": 1.658750057220459, + "reward_std": 0.47675803303718567, + "rewards/_accuracy_reward": 0.6587499976158142, + "rewards/_format_reward": 1.0, + "step": 1476 + }, + { + "completion_length": 120.875, + "epoch": 0.36925, + "grad_norm": 0.03868165612220764, + "kl": 0.05161227658390999, + "learning_rate": 3.974753092760901e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1477 + }, + { + "completion_length": 141.375, + "epoch": 0.3695, + "grad_norm": 0.7526807188987732, + "kl": 0.058717839419841766, + "learning_rate": 3.972990893383356e-06, + "loss": 0.0023, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1478 + }, + { + "completion_length": 106.875, + "epoch": 0.36975, + "grad_norm": 0.7434094548225403, + "kl": 0.12921760976314545, + "learning_rate": 3.971227572259167e-06, + "loss": 0.0052, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1479 + }, + { + "completion_length": 175.25, + "epoch": 0.37, + "grad_norm": 0.6596071124076843, + "kl": 0.08641275018453598, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0035, + "reward": 1.15625, + "reward_std": 0.6343936920166016, + "rewards/_accuracy_reward": 0.28125, + "rewards/_format_reward": 0.875, + "step": 1480 + }, + { + "completion_length": 208.0, + "epoch": 0.37025, + "grad_norm": 0.03975485637784004, + "kl": 0.06477142870426178, + "learning_rate": 3.9676975701431016e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1481 + }, + { + "completion_length": 132.25, + "epoch": 0.3705, + "grad_norm": 0.6661024689674377, + "kl": 0.058449484407901764, + "learning_rate": 3.965930891839473e-06, + "loss": 0.0023, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1482 + }, + { + "completion_length": 135.875, + "epoch": 0.37075, + "grad_norm": 0.8266745805740356, + "kl": 0.0733661875128746, + "learning_rate": 3.964163097165702e-06, + "loss": 0.0029, + "reward": 1.0999999046325684, + "reward_std": 0.09258202463388443, + "rewards/_accuracy_reward": 0.10000000149011612, + "rewards/_format_reward": 1.0, + "step": 1483 + }, + { + "completion_length": 114.125, + "epoch": 0.371, + "grad_norm": 0.01836472377181053, + "kl": 0.08049993216991425, + "learning_rate": 3.96239418746804e-06, + "loss": 0.0032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1484 + }, + { + "completion_length": 162.5, + "epoch": 0.37125, + "grad_norm": 0.5880891680717468, + "kl": 0.0393814891576767, + "learning_rate": 3.960624164093587e-06, + "loss": 0.0016, + "reward": 1.787500023841858, + "reward_std": 0.39708763360977173, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1485 + }, + { + "completion_length": 148.75, + "epoch": 0.3715, + "grad_norm": 0.5472244024276733, + "kl": 0.039617154747247696, + "learning_rate": 3.958853028390294e-06, + "loss": 0.0016, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1486 + }, + { + "completion_length": 150.875, + "epoch": 0.37175, + "grad_norm": 1.396892786026001, + "kl": 0.06323693692684174, + "learning_rate": 3.957080781706959e-06, + "loss": 0.0025, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1487 + }, + { + "completion_length": 188.875, + "epoch": 0.372, + "grad_norm": 0.034602127969264984, + "kl": 0.06623050570487976, + "learning_rate": 3.955307425393224e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1488 + }, + { + "completion_length": 111.25, + "epoch": 0.37225, + "grad_norm": 0.021022455766797066, + "kl": 0.08140836656093597, + "learning_rate": 3.953532960799577e-06, + "loss": 0.0033, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1489 + }, + { + "completion_length": 174.5, + "epoch": 0.3725, + "grad_norm": 0.3498646914958954, + "kl": 0.04529913514852524, + "learning_rate": 3.951757389277349e-06, + "loss": 0.0018, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1490 + }, + { + "completion_length": 172.25, + "epoch": 0.37275, + "grad_norm": 0.4678153991699219, + "kl": 0.04285871610045433, + "learning_rate": 3.949980712178718e-06, + "loss": 0.0017, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1491 + }, + { + "completion_length": 112.125, + "epoch": 0.373, + "grad_norm": 0.8213009834289551, + "kl": 0.0340723879635334, + "learning_rate": 3.948202930856697e-06, + "loss": 0.0014, + "reward": 1.756250023841858, + "reward_std": 0.6894291639328003, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 1492 + }, + { + "completion_length": 178.75, + "epoch": 0.37325, + "grad_norm": 0.7336386442184448, + "kl": 0.05348341166973114, + "learning_rate": 3.946424046665147e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1493 + }, + { + "completion_length": 123.625, + "epoch": 0.3735, + "grad_norm": 0.9568604230880737, + "kl": 0.06122196465730667, + "learning_rate": 3.944644060958764e-06, + "loss": 0.0024, + "reward": 1.5625, + "reward_std": 0.7165144085884094, + "rewards/_accuracy_reward": 0.6875, + "rewards/_format_reward": 0.875, + "step": 1494 + }, + { + "completion_length": 191.25, + "epoch": 0.37375, + "grad_norm": 0.7187319993972778, + "kl": 0.03870750963687897, + "learning_rate": 3.942862975093085e-06, + "loss": 0.0015, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1495 + }, + { + "completion_length": 116.5, + "epoch": 0.374, + "grad_norm": 0.014674518257379532, + "kl": 0.07812533527612686, + "learning_rate": 3.941080790424483e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1496 + }, + { + "completion_length": 140.625, + "epoch": 0.37425, + "grad_norm": 0.6387913823127747, + "kl": 0.03343196585774422, + "learning_rate": 3.939297508310172e-06, + "loss": 0.0013, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1497 + }, + { + "completion_length": 182.125, + "epoch": 0.3745, + "grad_norm": 0.4836607277393341, + "kl": 0.03932333365082741, + "learning_rate": 3.9375131301081974e-06, + "loss": 0.0016, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1498 + }, + { + "completion_length": 160.75, + "epoch": 0.37475, + "grad_norm": 0.6728492379188538, + "kl": 0.043070126324892044, + "learning_rate": 3.935727657177439e-06, + "loss": 0.0017, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.875, + "step": 1499 + }, + { + "completion_length": 191.75, + "epoch": 0.375, + "grad_norm": 0.5258508920669556, + "kl": 0.03584269806742668, + "learning_rate": 3.933941090877615e-06, + "loss": 0.0014, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1500 + }, + { + "completion_length": 94.25, + "epoch": 0.37525, + "grad_norm": 0.016977330669760704, + "kl": 0.021302910521626472, + "learning_rate": 3.932153432569273e-06, + "loss": 0.0009, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1501 + }, + { + "completion_length": 82.375, + "epoch": 0.3755, + "grad_norm": 0.9392129778862, + "kl": 0.0853070318698883, + "learning_rate": 3.930364683613791e-06, + "loss": 0.0034, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1502 + }, + { + "completion_length": 150.75, + "epoch": 0.37575, + "grad_norm": 0.03836243599653244, + "kl": 0.0422300361096859, + "learning_rate": 3.92857484537338e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1503 + }, + { + "completion_length": 190.0, + "epoch": 0.376, + "grad_norm": 0.571064829826355, + "kl": 0.048360809683799744, + "learning_rate": 3.92678391921108e-06, + "loss": 0.0019, + "reward": 1.5187499523162842, + "reward_std": 0.5147382020950317, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 0.875, + "step": 1504 + }, + { + "completion_length": 112.375, + "epoch": 0.37625, + "grad_norm": 0.7876760363578796, + "kl": 0.04685162380337715, + "learning_rate": 3.924991906490758e-06, + "loss": 0.0019, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.643750011920929, + "rewards/_format_reward": 1.0, + "step": 1505 + }, + { + "completion_length": 185.25, + "epoch": 0.3765, + "grad_norm": 0.451773077249527, + "kl": 0.034047823399305344, + "learning_rate": 3.923198808577111e-06, + "loss": 0.0014, + "reward": 1.537500023841858, + "reward_std": 0.7322909235954285, + "rewards/_accuracy_reward": 0.6625000238418579, + "rewards/_format_reward": 0.875, + "step": 1506 + }, + { + "completion_length": 115.625, + "epoch": 0.37675, + "grad_norm": 0.6053922176361084, + "kl": 0.058970190584659576, + "learning_rate": 3.921404626835661e-06, + "loss": 0.0024, + "reward": 1.787500023841858, + "reward_std": 0.39708760380744934, + "rewards/_accuracy_reward": 0.7875000238418579, + "rewards/_format_reward": 1.0, + "step": 1507 + }, + { + "completion_length": 195.625, + "epoch": 0.377, + "grad_norm": 0.6031469702720642, + "kl": 0.041082918643951416, + "learning_rate": 3.9196093626327535e-06, + "loss": 0.0016, + "reward": 1.0187499523162842, + "reward_std": 0.9184371829032898, + "rewards/_accuracy_reward": 0.39374998211860657, + "rewards/_format_reward": 0.625, + "step": 1508 + }, + { + "completion_length": 165.5, + "epoch": 0.37725, + "grad_norm": 0.4825488328933716, + "kl": 0.02711324580013752, + "learning_rate": 3.917813017335562e-06, + "loss": 0.0011, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1509 + }, + { + "completion_length": 189.75, + "epoch": 0.3775, + "grad_norm": 0.6947501301765442, + "kl": 0.04373620077967644, + "learning_rate": 3.916015592312083e-06, + "loss": 0.0017, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1510 + }, + { + "completion_length": 156.5, + "epoch": 0.37775, + "grad_norm": 0.613882303237915, + "kl": 0.045684535056352615, + "learning_rate": 3.9142170889311305e-06, + "loss": 0.0018, + "reward": 1.693750023841858, + "reward_std": 0.42714792490005493, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 1511 + }, + { + "completion_length": 177.375, + "epoch": 0.378, + "grad_norm": 0.01667964830994606, + "kl": 0.03237161785364151, + "learning_rate": 3.912417508562345e-06, + "loss": 0.0013, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1512 + }, + { + "completion_length": 198.5, + "epoch": 0.37825, + "grad_norm": 0.6279807686805725, + "kl": 0.05605170503258705, + "learning_rate": 3.910616852576186e-06, + "loss": 0.0022, + "reward": 1.21875, + "reward_std": 0.8284828662872314, + "rewards/_accuracy_reward": 0.46875, + "rewards/_format_reward": 0.75, + "step": 1513 + }, + { + "completion_length": 101.625, + "epoch": 0.3785, + "grad_norm": 0.06471621990203857, + "kl": 0.05444112420082092, + "learning_rate": 3.908815122343929e-06, + "loss": 0.0022, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1514 + }, + { + "completion_length": 142.25, + "epoch": 0.37875, + "grad_norm": 0.6215490698814392, + "kl": 0.04951619729399681, + "learning_rate": 3.907012319237672e-06, + "loss": 0.002, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1515 + }, + { + "completion_length": 179.875, + "epoch": 0.379, + "grad_norm": 0.561939001083374, + "kl": 0.0519590750336647, + "learning_rate": 3.905208444630326e-06, + "loss": 0.0021, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1516 + }, + { + "completion_length": 126.125, + "epoch": 0.37925, + "grad_norm": 0.022524600848555565, + "kl": 0.04927676171064377, + "learning_rate": 3.903403499895624e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1517 + }, + { + "completion_length": 147.875, + "epoch": 0.3795, + "grad_norm": 0.5980425477027893, + "kl": 0.03771920129656792, + "learning_rate": 3.901597486408105e-06, + "loss": 0.0015, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1518 + }, + { + "completion_length": 130.625, + "epoch": 0.37975, + "grad_norm": 0.9228925704956055, + "kl": 0.04406380653381348, + "learning_rate": 3.899790405543129e-06, + "loss": 0.0018, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1519 + }, + { + "completion_length": 128.625, + "epoch": 0.38, + "grad_norm": 0.02237652614712715, + "kl": 0.05214867368340492, + "learning_rate": 3.897982258676867e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1520 + }, + { + "completion_length": 152.375, + "epoch": 0.38025, + "grad_norm": 0.6844654679298401, + "kl": 0.05311375483870506, + "learning_rate": 3.896173047186302e-06, + "loss": 0.0021, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.2874999940395355, + "rewards/_format_reward": 1.0, + "step": 1521 + }, + { + "completion_length": 188.75, + "epoch": 0.3805, + "grad_norm": 0.5941595435142517, + "kl": 0.06788572669029236, + "learning_rate": 3.894362772449226e-06, + "loss": 0.0027, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1522 + }, + { + "completion_length": 142.875, + "epoch": 0.38075, + "grad_norm": 0.7482618689537048, + "kl": 0.05281698703765869, + "learning_rate": 3.892551435844242e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1523 + }, + { + "completion_length": 122.625, + "epoch": 0.381, + "grad_norm": 0.6537103652954102, + "kl": 0.03658046945929527, + "learning_rate": 3.890739038750763e-06, + "loss": 0.0015, + "reward": 1.5199999809265137, + "reward_std": 0.513308584690094, + "rewards/_accuracy_reward": 0.5199999809265137, + "rewards/_format_reward": 1.0, + "step": 1524 + }, + { + "completion_length": 94.625, + "epoch": 0.38125, + "grad_norm": 0.024736473336815834, + "kl": 0.029560457915067673, + "learning_rate": 3.888925582549006e-06, + "loss": 0.0012, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1525 + }, + { + "completion_length": 164.75, + "epoch": 0.3815, + "grad_norm": 0.03351657837629318, + "kl": 0.06395187973976135, + "learning_rate": 3.887111068619999e-06, + "loss": 0.0026, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1526 + }, + { + "completion_length": 129.5, + "epoch": 0.38175, + "grad_norm": 0.6471255421638489, + "kl": 0.08158797770738602, + "learning_rate": 3.885295498345572e-06, + "loss": 0.0033, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1527 + }, + { + "completion_length": 178.25, + "epoch": 0.382, + "grad_norm": 0.6274938583374023, + "kl": 0.04700816795229912, + "learning_rate": 3.88347887310836e-06, + "loss": 0.0019, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1528 + }, + { + "completion_length": 132.125, + "epoch": 0.38225, + "grad_norm": 0.7165562510490417, + "kl": 0.058674897998571396, + "learning_rate": 3.881661194291805e-06, + "loss": 0.0023, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1529 + }, + { + "completion_length": 162.25, + "epoch": 0.3825, + "grad_norm": 0.027459675446152687, + "kl": 0.06130118668079376, + "learning_rate": 3.879842463280146e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1530 + }, + { + "completion_length": 150.375, + "epoch": 0.38275, + "grad_norm": 0.031655825674533844, + "kl": 0.05282329395413399, + "learning_rate": 3.8780226814584265e-06, + "loss": 0.0021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1531 + }, + { + "completion_length": 187.75, + "epoch": 0.383, + "grad_norm": 0.5729331374168396, + "kl": 0.06790616363286972, + "learning_rate": 3.876201850212489e-06, + "loss": 0.0027, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1532 + }, + { + "completion_length": 165.125, + "epoch": 0.38325, + "grad_norm": 0.02087417244911194, + "kl": 0.038140442222356796, + "learning_rate": 3.874379970928977e-06, + "loss": 0.0015, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1533 + }, + { + "completion_length": 182.5, + "epoch": 0.3835, + "grad_norm": 0.5951360464096069, + "kl": 0.048477813601493835, + "learning_rate": 3.87255704499533e-06, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1534 + }, + { + "completion_length": 123.5, + "epoch": 0.38375, + "grad_norm": 0.7089921832084656, + "kl": 0.04717721790075302, + "learning_rate": 3.870733073799785e-06, + "loss": 0.0019, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1535 + }, + { + "completion_length": 182.75, + "epoch": 0.384, + "grad_norm": 0.029666420072317123, + "kl": 0.06745561957359314, + "learning_rate": 3.868908058731376e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1536 + }, + { + "completion_length": 172.25, + "epoch": 0.38425, + "grad_norm": 0.719944953918457, + "kl": 0.05279861390590668, + "learning_rate": 3.867082001179932e-06, + "loss": 0.0021, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1537 + }, + { + "completion_length": 162.25, + "epoch": 0.3845, + "grad_norm": 0.023668771609663963, + "kl": 0.047101955860853195, + "learning_rate": 3.865254902536073e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1538 + }, + { + "completion_length": 154.125, + "epoch": 0.38475, + "grad_norm": 0.5054484009742737, + "kl": 0.05012737214565277, + "learning_rate": 3.863426764191216e-06, + "loss": 0.002, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1539 + }, + { + "completion_length": 187.5, + "epoch": 0.385, + "grad_norm": 0.5208625793457031, + "kl": 0.04712379723787308, + "learning_rate": 3.861597587537568e-06, + "loss": 0.0019, + "reward": 1.2687499523162842, + "reward_std": 0.6999680995941162, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.75, + "step": 1540 + }, + { + "completion_length": 117.5, + "epoch": 0.38525, + "grad_norm": 0.6336519718170166, + "kl": 0.07700920104980469, + "learning_rate": 3.8597673739681265e-06, + "loss": 0.0031, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/_accuracy_reward": 0.8125, + "rewards/_format_reward": 1.0, + "step": 1541 + }, + { + "completion_length": 190.375, + "epoch": 0.3855, + "grad_norm": 0.5653288960456848, + "kl": 0.05022215470671654, + "learning_rate": 3.857936124876677e-06, + "loss": 0.002, + "reward": 1.3937499523162842, + "reward_std": 0.7336004972457886, + "rewards/_accuracy_reward": 0.5187499523162842, + "rewards/_format_reward": 0.875, + "step": 1542 + }, + { + "completion_length": 174.25, + "epoch": 0.38575, + "grad_norm": 0.7070077061653137, + "kl": 0.056165359914302826, + "learning_rate": 3.856103841657797e-06, + "loss": 0.0022, + "reward": 1.756250023841858, + "reward_std": 0.45153507590293884, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 0.875, + "step": 1543 + }, + { + "completion_length": 190.625, + "epoch": 0.386, + "grad_norm": 0.03828784078359604, + "kl": 0.0686645433306694, + "learning_rate": 3.85427052570685e-06, + "loss": 0.0027, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1544 + }, + { + "completion_length": 167.0, + "epoch": 0.38625, + "grad_norm": 0.5596043467521667, + "kl": 0.052377849817276, + "learning_rate": 3.8524361784199855e-06, + "loss": 0.0021, + "reward": 1.6262500286102295, + "reward_std": 0.7428312301635742, + "rewards/_accuracy_reward": 0.7512500286102295, + "rewards/_format_reward": 0.875, + "step": 1545 + }, + { + "completion_length": 158.0, + "epoch": 0.3865, + "grad_norm": 0.03330124169588089, + "kl": 0.03650851547718048, + "learning_rate": 3.850600801194138e-06, + "loss": 0.0015, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1546 + }, + { + "completion_length": 128.5, + "epoch": 0.38675, + "grad_norm": 0.5438423752784729, + "kl": 0.03633885830640793, + "learning_rate": 3.8487643954270274e-06, + "loss": 0.0015, + "reward": 1.8762500286102295, + "reward_std": 0.35001784563064575, + "rewards/_accuracy_reward": 0.8762500286102295, + "rewards/_format_reward": 1.0, + "step": 1547 + }, + { + "completion_length": 149.75, + "epoch": 0.387, + "grad_norm": 0.6018542647361755, + "kl": 0.05037950351834297, + "learning_rate": 3.846926962517158e-06, + "loss": 0.002, + "reward": 1.6437499523162842, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.6437499523162842, + "rewards/_format_reward": 1.0, + "step": 1548 + }, + { + "completion_length": 130.875, + "epoch": 0.38725, + "grad_norm": 0.7819284796714783, + "kl": 0.059603843837976456, + "learning_rate": 3.845088503863813e-06, + "loss": 0.0024, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1549 + }, + { + "completion_length": 175.125, + "epoch": 0.3875, + "grad_norm": 0.5813125371932983, + "kl": 0.06442350149154663, + "learning_rate": 3.8432490208670605e-06, + "loss": 0.0026, + "reward": 1.0374999046325684, + "reward_std": 0.5350233912467957, + "rewards/_accuracy_reward": 0.16249999403953552, + "rewards/_format_reward": 0.875, + "step": 1550 + }, + { + "completion_length": 182.25, + "epoch": 0.38775, + "grad_norm": 0.6130213737487793, + "kl": 0.06105250120162964, + "learning_rate": 3.8414085149277445e-06, + "loss": 0.0024, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/_accuracy_reward": 0.65625, + "rewards/_format_reward": 0.75, + "step": 1551 + }, + { + "completion_length": 138.875, + "epoch": 0.388, + "grad_norm": 0.0351678803563118, + "kl": 0.05961094796657562, + "learning_rate": 3.839566987447492e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1552 + }, + { + "completion_length": 178.0, + "epoch": 0.38825, + "grad_norm": 0.5075997114181519, + "kl": 0.05547872185707092, + "learning_rate": 3.8377244398287065e-06, + "loss": 0.0022, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1553 + }, + { + "completion_length": 123.625, + "epoch": 0.3885, + "grad_norm": 0.6791297793388367, + "kl": 0.0307548139244318, + "learning_rate": 3.835880873474567e-06, + "loss": 0.0012, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1554 + }, + { + "completion_length": 159.5, + "epoch": 0.38875, + "grad_norm": 0.021484725177288055, + "kl": 0.04366430640220642, + "learning_rate": 3.83403628978903e-06, + "loss": 0.0017, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1555 + }, + { + "completion_length": 148.875, + "epoch": 0.389, + "grad_norm": 0.501158595085144, + "kl": 0.06199439615011215, + "learning_rate": 3.832190690176825e-06, + "loss": 0.0025, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1556 + }, + { + "completion_length": 142.75, + "epoch": 0.38925, + "grad_norm": 0.5822856426239014, + "kl": 0.04813998565077782, + "learning_rate": 3.830344076043459e-06, + "loss": 0.0019, + "reward": 1.7574999332427979, + "reward_std": 0.4491499960422516, + "rewards/_accuracy_reward": 0.7575000524520874, + "rewards/_format_reward": 1.0, + "step": 1557 + }, + { + "completion_length": 178.0, + "epoch": 0.3895, + "grad_norm": 0.517787516117096, + "kl": 0.07786907255649567, + "learning_rate": 3.828496448795208e-06, + "loss": 0.0031, + "reward": 1.65625, + "reward_std": 0.7188470363616943, + "rewards/_accuracy_reward": 0.78125, + "rewards/_format_reward": 0.875, + "step": 1558 + }, + { + "completion_length": 167.375, + "epoch": 0.38975, + "grad_norm": 0.5909892320632935, + "kl": 0.05643405765295029, + "learning_rate": 3.826647809839119e-06, + "loss": 0.0023, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1559 + }, + { + "completion_length": 133.125, + "epoch": 0.39, + "grad_norm": 0.6651085019111633, + "kl": 0.05972522869706154, + "learning_rate": 3.824798160583012e-06, + "loss": 0.0024, + "reward": 1.40625, + "reward_std": 0.49167174100875854, + "rewards/_accuracy_reward": 0.4062499701976776, + "rewards/_format_reward": 1.0, + "step": 1560 + }, + { + "completion_length": 102.75, + "epoch": 0.39025, + "grad_norm": 0.022245794534683228, + "kl": 0.039669353514909744, + "learning_rate": 3.822947502435477e-06, + "loss": 0.0016, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1561 + }, + { + "completion_length": 186.75, + "epoch": 0.3905, + "grad_norm": 0.6299328207969666, + "kl": 0.07042766362428665, + "learning_rate": 3.821095836805868e-06, + "loss": 0.0028, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.637499988079071, + "rewards/_format_reward": 0.875, + "step": 1562 + }, + { + "completion_length": 171.5, + "epoch": 0.39075, + "grad_norm": 0.026600031182169914, + "kl": 0.05926269665360451, + "learning_rate": 3.819243165104311e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1563 + }, + { + "completion_length": 126.625, + "epoch": 0.391, + "grad_norm": 0.6196228861808777, + "kl": 0.04901750385761261, + "learning_rate": 3.817389488741694e-06, + "loss": 0.002, + "reward": 1.631250023841858, + "reward_std": 0.738210916519165, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.875, + "step": 1564 + }, + { + "completion_length": 133.25, + "epoch": 0.39125, + "grad_norm": 0.6434496641159058, + "kl": 0.0704207792878151, + "learning_rate": 3.815534809129674e-06, + "loss": 0.0028, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1565 + }, + { + "completion_length": 95.375, + "epoch": 0.3915, + "grad_norm": 0.6863855719566345, + "kl": 0.07160824537277222, + "learning_rate": 3.8136791276806695e-06, + "loss": 0.0029, + "reward": 1.568750023841858, + "reward_std": 0.4689939618110657, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 0.875, + "step": 1566 + }, + { + "completion_length": 97.5, + "epoch": 0.39175, + "grad_norm": 0.8386607766151428, + "kl": 0.050130948424339294, + "learning_rate": 3.8118224458078633e-06, + "loss": 0.002, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/_accuracy_reward": 0.71875, + "rewards/_format_reward": 1.0, + "step": 1567 + }, + { + "completion_length": 185.625, + "epoch": 0.392, + "grad_norm": 0.7192432284355164, + "kl": 0.07506588846445084, + "learning_rate": 3.8099647649251984e-06, + "loss": 0.003, + "reward": 1.381250023841858, + "reward_std": 0.9133679866790771, + "rewards/_accuracy_reward": 0.6312500238418579, + "rewards/_format_reward": 0.75, + "step": 1568 + }, + { + "completion_length": 197.25, + "epoch": 0.39225, + "grad_norm": 0.5353832244873047, + "kl": 0.0634760856628418, + "learning_rate": 3.8081060864473794e-06, + "loss": 0.0025, + "reward": 1.1387499570846558, + "reward_std": 0.8324737548828125, + "rewards/_accuracy_reward": 0.38874998688697815, + "rewards/_format_reward": 0.75, + "step": 1569 + }, + { + "completion_length": 117.125, + "epoch": 0.3925, + "grad_norm": 0.019449777901172638, + "kl": 0.057256847620010376, + "learning_rate": 3.806246411789872e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1570 + }, + { + "completion_length": 195.25, + "epoch": 0.39275, + "grad_norm": 0.6402668356895447, + "kl": 0.0625515803694725, + "learning_rate": 3.8043857423688995e-06, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/_accuracy_reward": 0.875, + "rewards/_format_reward": 0.875, + "step": 1571 + }, + { + "completion_length": 119.75, + "epoch": 0.393, + "grad_norm": 0.6904266476631165, + "kl": 0.04785230755805969, + "learning_rate": 3.802524079601442e-06, + "loss": 0.0019, + "reward": 1.693750023841858, + "reward_std": 0.4271479547023773, + "rewards/_accuracy_reward": 0.6937500238418579, + "rewards/_format_reward": 1.0, + "step": 1572 + }, + { + "completion_length": 130.75, + "epoch": 0.39325, + "grad_norm": 0.5492718815803528, + "kl": 0.039260704070329666, + "learning_rate": 3.8006614249052353e-06, + "loss": 0.0016, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1573 + }, + { + "completion_length": 161.25, + "epoch": 0.3935, + "grad_norm": 0.030837608501315117, + "kl": 0.07404318451881409, + "learning_rate": 3.798797779698774e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1574 + }, + { + "completion_length": 123.625, + "epoch": 0.39375, + "grad_norm": 0.02218150906264782, + "kl": 0.07550845295190811, + "learning_rate": 3.796933145401304e-06, + "loss": 0.003, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1575 + }, + { + "completion_length": 94.0, + "epoch": 0.394, + "grad_norm": 0.9086791276931763, + "kl": 0.055962108075618744, + "learning_rate": 3.795067523432826e-06, + "loss": 0.0022, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1576 + }, + { + "completion_length": 158.125, + "epoch": 0.39425, + "grad_norm": 0.7055941224098206, + "kl": 0.05105192959308624, + "learning_rate": 3.7932009152140926e-06, + "loss": 0.002, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1577 + }, + { + "completion_length": 153.875, + "epoch": 0.3945, + "grad_norm": 0.6992962956428528, + "kl": 0.06621918827295303, + "learning_rate": 3.791333322166605e-06, + "loss": 0.0026, + "reward": 1.5125000476837158, + "reward_std": 0.7467787861824036, + "rewards/_accuracy_reward": 0.6375000476837158, + "rewards/_format_reward": 0.875, + "step": 1578 + }, + { + "completion_length": 111.875, + "epoch": 0.39475, + "grad_norm": 0.025215838104486465, + "kl": 0.04421816021203995, + "learning_rate": 3.7894647457126188e-06, + "loss": 0.0018, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1579 + }, + { + "completion_length": 139.0, + "epoch": 0.395, + "grad_norm": 0.020408930256962776, + "kl": 0.04933981969952583, + "learning_rate": 3.787595187275136e-06, + "loss": 0.002, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1580 + }, + { + "completion_length": 165.5, + "epoch": 0.39525, + "grad_norm": 0.030607668682932854, + "kl": 0.060726772993803024, + "learning_rate": 3.7857246482779052e-06, + "loss": 0.0024, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1581 + }, + { + "completion_length": 150.875, + "epoch": 0.3955, + "grad_norm": 0.6549018621444702, + "kl": 0.0552009716629982, + "learning_rate": 3.7838531301454257e-06, + "loss": 0.0022, + "reward": 1.7625000476837158, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 1.0, + "step": 1582 + }, + { + "completion_length": 168.625, + "epoch": 0.39575, + "grad_norm": 0.7822985649108887, + "kl": 0.07485631853342056, + "learning_rate": 3.7819806343029373e-06, + "loss": 0.003, + "reward": 1.881250023841858, + "reward_std": 0.3358757197856903, + "rewards/_accuracy_reward": 0.8812500238418579, + "rewards/_format_reward": 1.0, + "step": 1583 + }, + { + "completion_length": 128.0, + "epoch": 0.396, + "grad_norm": 0.709174633026123, + "kl": 0.04844846948981285, + "learning_rate": 3.780107162176429e-06, + "loss": 0.0019, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1584 + }, + { + "completion_length": 137.75, + "epoch": 0.39625, + "grad_norm": 0.696996808052063, + "kl": 0.04638068005442619, + "learning_rate": 3.77823271519263e-06, + "loss": 0.0019, + "reward": 1.6387500762939453, + "reward_std": 0.49872517585754395, + "rewards/_accuracy_reward": 0.6387499570846558, + "rewards/_format_reward": 1.0, + "step": 1585 + }, + { + "completion_length": 124.5, + "epoch": 0.3965, + "grad_norm": 0.02669079601764679, + "kl": 0.05704856663942337, + "learning_rate": 3.776357294779015e-06, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1586 + }, + { + "completion_length": 163.375, + "epoch": 0.39675, + "grad_norm": 0.6654564738273621, + "kl": 0.0693705603480339, + "learning_rate": 3.774480902363795e-06, + "loss": 0.0028, + "reward": 1.4187500476837158, + "reward_std": 0.7235515117645264, + "rewards/_accuracy_reward": 0.5437500476837158, + "rewards/_format_reward": 0.875, + "step": 1587 + }, + { + "completion_length": 143.5, + "epoch": 0.397, + "grad_norm": 0.5335232615470886, + "kl": 0.04081644117832184, + "learning_rate": 3.772603539375929e-06, + "loss": 0.0016, + "reward": 1.6375000476837158, + "reward_std": 0.5005354285240173, + "rewards/_accuracy_reward": 0.762499988079071, + "rewards/_format_reward": 0.875, + "step": 1588 + }, + { + "completion_length": 113.625, + "epoch": 0.39725, + "grad_norm": 0.7147615551948547, + "kl": 0.04324035719037056, + "learning_rate": 3.770725207245106e-06, + "loss": 0.0017, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/_accuracy_reward": 0.90625, + "rewards/_format_reward": 1.0, + "step": 1589 + }, + { + "completion_length": 154.0, + "epoch": 0.3975, + "grad_norm": 0.6111875772476196, + "kl": 0.0737532302737236, + "learning_rate": 3.768845907401761e-06, + "loss": 0.003, + "reward": 1.2874999046325684, + "reward_std": 0.4397645592689514, + "rewards/_accuracy_reward": 0.28749996423721313, + "rewards/_format_reward": 1.0, + "step": 1590 + }, + { + "completion_length": 143.875, + "epoch": 0.39775, + "grad_norm": 0.6925716996192932, + "kl": 0.0703587755560875, + "learning_rate": 3.7669656412770605e-06, + "loss": 0.0028, + "reward": 1.0587499141693115, + "reward_std": 0.5271334052085876, + "rewards/_accuracy_reward": 0.1837500035762787, + "rewards/_format_reward": 0.875, + "step": 1591 + }, + { + "completion_length": 157.875, + "epoch": 0.398, + "grad_norm": 0.5909250974655151, + "kl": 0.04481671750545502, + "learning_rate": 3.7650844103029093e-06, + "loss": 0.0018, + "reward": 1.5199999809265137, + "reward_std": 0.5133086442947388, + "rewards/_accuracy_reward": 0.5199999809265137, + "rewards/_format_reward": 1.0, + "step": 1592 + }, + { + "completion_length": 173.75, + "epoch": 0.39825, + "grad_norm": 0.041420962661504745, + "kl": 0.07704256474971771, + "learning_rate": 3.763202215911948e-06, + "loss": 0.0031, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1593 + }, + { + "completion_length": 123.0, + "epoch": 0.3985, + "grad_norm": 0.033740244805812836, + "kl": 0.046596985310316086, + "learning_rate": 3.7613190595375484e-06, + "loss": 0.0019, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1594 + }, + { + "completion_length": 173.5, + "epoch": 0.39875, + "grad_norm": 0.6274866461753845, + "kl": 0.06995417177677155, + "learning_rate": 3.759434942613816e-06, + "loss": 0.0028, + "reward": 1.524999976158142, + "reward_std": 0.5077964067459106, + "rewards/_accuracy_reward": 0.5249999761581421, + "rewards/_format_reward": 1.0, + "step": 1595 + }, + { + "completion_length": 133.25, + "epoch": 0.399, + "grad_norm": 0.02433399297297001, + "kl": 0.06196637451648712, + "learning_rate": 3.7575498665755884e-06, + "loss": 0.0025, + "reward": 2.0, + "reward_std": 0.0, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 1.0, + "step": 1596 + }, + { + "completion_length": 171.75, + "epoch": 0.39925, + "grad_norm": 0.5644596815109253, + "kl": 0.07158312946557999, + "learning_rate": 3.7556638328584314e-06, + "loss": 0.0029, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/_accuracy_reward": 0.75, + "rewards/_format_reward": 0.75, + "step": 1597 + }, + { + "completion_length": 140.75, + "epoch": 0.3995, + "grad_norm": 0.6017252802848816, + "kl": 0.058312345296144485, + "learning_rate": 3.753776842898644e-06, + "loss": 0.0023, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/_accuracy_reward": 1.0, + "rewards/_format_reward": 0.875, + "step": 1598 + }, + { + "completion_length": 162.125, + "epoch": 0.39975, + "grad_norm": 0.6574504375457764, + "kl": 0.06590811908245087, + "learning_rate": 3.751888898133249e-06, + "loss": 0.0026, + "reward": 1.3125, + "reward_std": 0.4299086630344391, + "rewards/_accuracy_reward": 0.3124999701976776, + "rewards/_format_reward": 1.0, + "step": 1599 + }, + { + "completion_length": 179.0, + "epoch": 0.4, + "grad_norm": 0.533219575881958, + "kl": 0.07962776720523834, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0032, + "reward": 1.506250023841858, + "reward_std": 0.7513975501060486, + "rewards/_accuracy_reward": 0.7562500238418579, + "rewards/_format_reward": 0.75, + "step": 1600 + } + ], + "logging_steps": 1, + "max_steps": 4000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}