{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05596753882748006, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.75, "completions/max_terminated_length": 706.75, "completions/mean_length": 561.5, "completions/mean_terminated_length": 561.5, "completions/min_length": 304.25, "completions/min_terminated_length": 304.25, "epoch": 0.00011193507765496012, "grad_norm": 0.6556430486006678, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0315, "num_tokens": 39768.0, "reward": 0.02219326765043661, "reward_std": 0.04227059497497976, "rewards/code_reward/mean": 0.02219326765043661, "rewards/code_reward/std": 0.0422705952078104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.25, "completions/max_terminated_length": 609.25, "completions/mean_length": 333.3125, "completions/mean_terminated_length": 333.3125, "completions/min_length": 168.75, "completions/min_terminated_length": 168.75, "epoch": 0.00022387015530992023, "grad_norm": 0.9104023477920656, "kl": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": -0.0684, "num_tokens": 62690.0, "reward": 0.09372148709371686, "reward_std": 0.05263180285692215, "rewards/code_reward/mean": 0.09372148709371686, "rewards/code_reward/std": 0.052631803788244724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.25, "completions/max_terminated_length": 613.25, "completions/mean_length": 431.03125, "completions/mean_terminated_length": 431.03125, "completions/min_length": 231.5, "completions/min_terminated_length": 231.5, "epoch": 0.00033580523296488035, "grad_norm": 0.7467116220042507, "kl": 6.216764450073242e-05, "learning_rate": 6.666666666666667e-07, "loss": 0.0176, "num_tokens": 93451.0, "reward": 0.02661502081900835, "reward_std": 0.03690493572503328, "rewards/code_reward/mean": 0.02661502081900835, "rewards/code_reward/std": 0.03690493851900101, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 379.625, "completions/mean_terminated_length": 379.625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.00044774031061984047, "grad_norm": 0.9821405105801766, "kl": 7.545948028564453e-05, "learning_rate": 1.0000000000000002e-06, "loss": -0.0724, "num_tokens": 120351.0, "reward": 0.07474911026656628, "reward_std": 0.10846673045307398, "rewards/code_reward/mean": 0.07474911026656628, "rewards/code_reward/std": 0.1084667295217514, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.25, "completions/max_terminated_length": 699.25, "completions/mean_length": 458.15625, "completions/mean_terminated_length": 458.15625, "completions/min_length": 172.75, "completions/min_terminated_length": 172.75, "epoch": 0.0005596753882748006, "grad_norm": 0.7621492143305323, "kl": 4.2319297790527344e-05, "learning_rate": 1.3333333333333334e-06, "loss": -0.022, "num_tokens": 155684.0, "reward": 0.021726191509515047, "reward_std": 0.02874244563281536, "rewards/code_reward/mean": 0.021726191509515047, "rewards/code_reward/std": 0.02874244749546051, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.5, "completions/max_terminated_length": 764.5, "completions/mean_length": 439.15625, "completions/mean_terminated_length": 439.15625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.0006716104659297607, "grad_norm": 1.0850029834641424, "kl": 4.9442052841186523e-05, "learning_rate": 1.6666666666666667e-06, "loss": 0.0318, "num_tokens": 189689.0, "reward": 0.36408869456499815, "reward_std": 0.2700020968914032, "rewards/code_reward/mean": 0.36408869456499815, "rewards/code_reward/std": 0.2700021122582257, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.75, "completions/max_terminated_length": 783.75, "completions/mean_length": 459.84375, "completions/mean_terminated_length": 459.84375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.0007835455435847208, "grad_norm": 1.1358452129000465, "kl": 9.28044319152832e-05, "learning_rate": 2.0000000000000003e-06, "loss": -0.0229, "num_tokens": 223804.0, "reward": 0.07010709377937019, "reward_std": 0.09961615316569805, "rewards/code_reward/mean": 0.07010709377937019, "rewards/code_reward/std": 0.09961615689098835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.25, "completions/max_terminated_length": 560.25, "completions/mean_length": 390.28125, "completions/mean_terminated_length": 390.28125, "completions/min_length": 230.25, "completions/min_terminated_length": 230.25, "epoch": 0.0008954806212396809, "grad_norm": 1.0894200859707648, "kl": 6.967782974243164e-05, "learning_rate": 2.3333333333333336e-06, "loss": -0.0832, "num_tokens": 250717.0, "reward": 0.17307570209959522, "reward_std": 0.17698997142724693, "rewards/code_reward/mean": 0.17307570209959522, "rewards/code_reward/std": 0.17698997911065817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.5, "completions/max_terminated_length": 759.5, "completions/mean_length": 530.4375, "completions/mean_terminated_length": 530.4375, "completions/min_length": 260.25, "completions/min_terminated_length": 260.25, "epoch": 0.0010074156988946412, "grad_norm": 0.6609406173759865, "kl": 0.00013196468353271484, "learning_rate": 2.666666666666667e-06, "loss": 0.0563, "num_tokens": 290563.0, "reward": 0.0257048096973449, "reward_std": 0.036920994287356734, "rewards/code_reward/mean": 0.0257048096973449, "rewards/code_reward/std": 0.03692099452018738, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.25, "completions/max_terminated_length": 749.25, "completions/mean_length": 468.71875, "completions/mean_terminated_length": 468.71875, "completions/min_length": 277.25, "completions/min_terminated_length": 277.25, "epoch": 0.0011193507765496012, "grad_norm": 0.816974823658789, "kl": 0.00015497207641601562, "learning_rate": 3e-06, "loss": -0.0226, "num_tokens": 325418.0, "reward": 0.09049492585472763, "reward_std": 0.18241150537505746, "rewards/code_reward/mean": 0.09049492585472763, "rewards/code_reward/std": 0.18241151235997677, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.5, "completions/max_terminated_length": 523.5, "completions/mean_length": 360.09375, "completions/mean_terminated_length": 360.09375, "completions/min_length": 172.75, "completions/min_terminated_length": 172.75, "epoch": 0.0012312858542045614, "grad_norm": 1.3579676028203136, "kl": 0.0004100799560546875, "learning_rate": 3.3333333333333333e-06, "loss": 0.0149, "num_tokens": 348085.0, "reward": 0.07968997955322266, "reward_std": 0.19473078846931458, "rewards/code_reward/mean": 0.07968997955322266, "rewards/code_reward/std": 0.19473078846931458, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.25, "completions/max_terminated_length": 719.25, "completions/mean_length": 476.03125, "completions/mean_terminated_length": 476.03125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.0013432209318595214, "grad_norm": 1.0254632341024847, "kl": 0.00038051605224609375, "learning_rate": 3.6666666666666666e-06, "loss": -0.0456, "num_tokens": 379078.0, "reward": 0.06544117676094174, "reward_std": 0.11873381165787578, "rewards/code_reward/mean": 0.06544117676094174, "rewards/code_reward/std": 0.11873381165787578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.75, "completions/max_terminated_length": 643.75, "completions/mean_length": 456.40625, "completions/mean_terminated_length": 456.40625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.0014551560095144816, "grad_norm": 0.7844627176055754, "kl": 0.0006098747253417969, "learning_rate": 4.000000000000001e-06, "loss": 0.0275, "num_tokens": 410643.0, "reward": 0.08870427880901843, "reward_std": 0.1394934863783419, "rewards/code_reward/mean": 0.08870427880901843, "rewards/code_reward/std": 0.13949348265305161, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 946.5, "completions/max_terminated_length": 779.25, "completions/mean_length": 474.78125, "completions/mean_terminated_length": 433.4776916503906, "completions/min_length": 212.5, "completions/min_terminated_length": 212.5, "epoch": 0.0015670910871694416, "grad_norm": 1.0068663809227436, "kl": 0.001148223876953125, "learning_rate": 4.333333333333334e-06, "loss": -0.0596, "num_tokens": 444292.0, "reward": 0.09596806723857298, "reward_std": 0.1574952198425308, "rewards/code_reward/mean": 0.09596806723857298, "rewards/code_reward/std": 0.15749522170517594, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.75, "completions/max_terminated_length": 754.75, "completions/mean_length": 491.375, "completions/mean_terminated_length": 491.375, "completions/min_length": 320.5, "completions/min_terminated_length": 320.5, "epoch": 0.0016790261648244019, "grad_norm": 0.6616890996356674, "kl": 0.0012340545654296875, "learning_rate": 4.666666666666667e-06, "loss": 0.0086, "num_tokens": 480536.0, "reward": 0.04718137255986221, "reward_std": 0.06556019705021754, "rewards/code_reward/mean": 0.04718137255986221, "rewards/code_reward/std": 0.0655602045590058, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 434.03125, "completions/mean_terminated_length": 434.03125, "completions/min_length": 225.25, "completions/min_terminated_length": 225.25, "epoch": 0.0017909612424793619, "grad_norm": 0.8195686424785313, "kl": 0.0021371841430664062, "learning_rate": 5e-06, "loss": -0.0293, "num_tokens": 517233.0, "reward": 0.14742368459701538, "reward_std": 0.13962780684232712, "rewards/code_reward/mean": 0.14742368459701538, "rewards/code_reward/std": 0.13962781056761742, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.25, "completions/max_terminated_length": 670.25, "completions/mean_length": 466.125, "completions/mean_terminated_length": 466.125, "completions/min_length": 264.25, "completions/min_terminated_length": 264.25, "epoch": 0.001902896320134322, "grad_norm": 0.7604607536894525, "kl": 0.00327301025390625, "learning_rate": 4.999952797253148e-06, "loss": -0.0188, "num_tokens": 552989.0, "reward": 0.03991336654871702, "reward_std": 0.05816664919257164, "rewards/code_reward/mean": 0.03991336654871702, "rewards/code_reward/std": 0.05816664732992649, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 413.21875, "completions/mean_terminated_length": 413.21875, "completions/min_length": 243.25, "completions/min_terminated_length": 243.25, "epoch": 0.0020148313977892823, "grad_norm": 0.8446607287984066, "kl": 0.00414276123046875, "learning_rate": 4.9998111909931225e-06, "loss": -0.0223, "num_tokens": 582628.0, "reward": 0.027369487448595464, "reward_std": 0.05444430746138096, "rewards/code_reward/mean": 0.027369487448595464, "rewards/code_reward/std": 0.05444430839270353, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1086.75, "completions/max_terminated_length": 799.25, "completions/mean_length": 567.1875, "completions/mean_terminated_length": 521.8258972167969, "completions/min_length": 304.5, "completions/min_terminated_length": 304.5, "epoch": 0.0021267664754442426, "grad_norm": 0.6815859244268851, "kl": 0.00553131103515625, "learning_rate": 4.999575187161439e-06, "loss": 0.0078, "num_tokens": 620538.0, "reward": 0.03672935510985553, "reward_std": 0.029647217132151127, "rewards/code_reward/mean": 0.03672935510985553, "rewards/code_reward/std": 0.029647217132151127, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.25, "completions/max_terminated_length": 805.25, "completions/mean_length": 507.21875, "completions/mean_terminated_length": 507.21875, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.0022387015530992023, "grad_norm": 0.4364759611369158, "kl": 0.00673675537109375, "learning_rate": 4.9992447956603455e-06, "loss": 0.0085, "num_tokens": 659145.0, "reward": 0.00214460794813931, "reward_std": 0.006065867375582457, "rewards/code_reward/mean": 0.00214460794813931, "rewards/code_reward/std": 0.006065867375582457, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.25, "completions/max_terminated_length": 488.25, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 191.5, "completions/min_terminated_length": 191.5, "epoch": 0.0023506366307541626, "grad_norm": 1.1196346766853063, "kl": 0.009761810302734375, "learning_rate": 4.998820030352409e-06, "loss": -0.0401, "num_tokens": 681841.0, "reward": 0.21865647949744016, "reward_std": 0.16933009633794427, "rewards/code_reward/mean": 0.21865647949744016, "rewards/code_reward/std": 0.16933008842170238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1162.75, "completions/max_terminated_length": 852.25, "completions/mean_length": 618.125, "completions/mean_terminated_length": 572.4866180419922, "completions/min_length": 325.75, "completions/min_terminated_length": 325.75, "epoch": 0.002462571708409123, "grad_norm": 0.8324675842567315, "kl": 0.00992584228515625, "learning_rate": 4.998300909059929e-06, "loss": 0.0359, "num_tokens": 722261.0, "reward": 0.03132503107190132, "reward_std": 0.04826245130971074, "rewards/code_reward/mean": 0.03132503107190132, "rewards/code_reward/std": 0.048262451542541385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.5, "completions/max_terminated_length": 668.5, "completions/mean_length": 449.5625, "completions/mean_terminated_length": 449.5625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.002574506786064083, "grad_norm": 0.8188216899529877, "kl": 0.012542724609375, "learning_rate": 4.997687453564198e-06, "loss": -0.0757, "num_tokens": 756751.0, "reward": 0.0606757253408432, "reward_std": 0.08386795781552792, "rewards/code_reward/mean": 0.0606757253408432, "rewards/code_reward/std": 0.08386795967817307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 397.21875, "completions/mean_terminated_length": 397.21875, "completions/min_length": 221.5, "completions/min_terminated_length": 221.5, "epoch": 0.002686441863719043, "grad_norm": 0.9882922552317236, "kl": 0.020965576171875, "learning_rate": 4.9969796896045775e-06, "loss": -0.0241, "num_tokens": 784286.0, "reward": 0.03399203496519476, "reward_std": 0.06442949129268527, "rewards/code_reward/mean": 0.03399203496519476, "rewards/code_reward/std": 0.06442949641495943, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.75, "completions/max_terminated_length": 692.75, "completions/mean_length": 500.4375, "completions/mean_terminated_length": 500.4375, "completions/min_length": 286.75, "completions/min_terminated_length": 286.75, "epoch": 0.002798376941374003, "grad_norm": 1.0525741278330438, "kl": 0.0211029052734375, "learning_rate": 4.996177646877426e-06, "loss": 0.0871, "num_tokens": 818260.0, "reward": 0.1732453762087971, "reward_std": 0.19011690141633153, "rewards/code_reward/mean": 0.1732453762087971, "rewards/code_reward/std": 0.19011691492050886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.5, "completions/max_terminated_length": 651.5, "completions/mean_length": 481.875, "completions/mean_terminated_length": 481.875, "completions/min_length": 325.25, "completions/min_terminated_length": 325.25, "epoch": 0.0029103120190289633, "grad_norm": 0.8323803477726922, "kl": 0.02197265625, "learning_rate": 4.995281359034851e-06, "loss": -0.0176, "num_tokens": 858344.0, "reward": 0.07887662292341702, "reward_std": 0.06681955535896122, "rewards/code_reward/mean": 0.07887662292341702, "rewards/code_reward/std": 0.06681956280954182, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1061.75, "completions/max_terminated_length": 923.25, "completions/mean_length": 631.0, "completions/mean_terminated_length": 599.9107360839844, "completions/min_length": 343.5, "completions/min_terminated_length": 343.5, "epoch": 0.0030222470966839235, "grad_norm": 0.758606194839006, "kl": 0.01599884033203125, "learning_rate": 4.994290863683296e-06, "loss": -0.0909, "num_tokens": 900352.0, "reward": 0.019329323433339596, "reward_std": 0.05079583264887333, "rewards/code_reward/mean": 0.019329323433339596, "rewards/code_reward/std": 0.05079583264887333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1012.75, "completions/max_terminated_length": 680.75, "completions/mean_length": 559.90625, "completions/mean_terminated_length": 515.2723236083984, "completions/min_length": 366.5, "completions/min_terminated_length": 366.5, "epoch": 0.0031341821743388833, "grad_norm": 0.6052468908705184, "kl": 0.019195556640625, "learning_rate": 4.99320620238196e-06, "loss": -0.0222, "num_tokens": 934653.0, "reward": 0.3267045458778739, "reward_std": 0.13263714499771595, "rewards/code_reward/mean": 0.3267045458778739, "rewards/code_reward/std": 0.1326371468603611, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.25, "completions/max_terminated_length": 755.25, "completions/mean_length": 560.125, "completions/mean_terminated_length": 560.125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.0032461172519938435, "grad_norm": 0.7871054475315462, "kl": 0.026947021484375, "learning_rate": 4.99202742064106e-06, "loss": -0.0025, "num_tokens": 963673.0, "reward": 0.1587616038741544, "reward_std": 0.16337263770401478, "rewards/code_reward/mean": 0.1587616038741544, "rewards/code_reward/std": 0.16337264538742602, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 521.5625, "completions/mean_terminated_length": 521.5625, "completions/min_length": 357.5, "completions/min_terminated_length": 357.5, "epoch": 0.0033580523296488037, "grad_norm": 0.7847358134959852, "kl": 0.0301513671875, "learning_rate": 4.990754567919917e-06, "loss": -0.0139, "num_tokens": 1001035.0, "reward": 0.06875000009313226, "reward_std": 0.08176466450095177, "rewards/code_reward/mean": 0.06875000009313226, "rewards/code_reward/std": 0.08176466636359692, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.75, "completions/max_terminated_length": 795.75, "completions/mean_length": 541.53125, "completions/mean_terminated_length": 541.53125, "completions/min_length": 275.75, "completions/min_terminated_length": 275.75, "epoch": 0.003469987407303764, "grad_norm": 0.7341146031808772, "kl": 0.038360595703125, "learning_rate": 4.989387697624881e-06, "loss": -0.0057, "num_tokens": 1032172.0, "reward": 0.12754360469989479, "reward_std": 0.13973576435819268, "rewards/code_reward/mean": 0.12754360469989479, "rewards/code_reward/std": 0.13973576435819268, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.25, "completions/max_terminated_length": 855.25, "completions/mean_length": 615.25, "completions/mean_terminated_length": 615.25, "completions/min_length": 476.5, "completions/min_terminated_length": 476.5, "epoch": 0.0035819224849587238, "grad_norm": 0.6710232004425948, "kl": 0.03857421875, "learning_rate": 4.987926867107095e-06, "loss": 0.0172, "num_tokens": 1071724.0, "reward": 0.04409082653000951, "reward_std": 0.08446959964931011, "rewards/code_reward/mean": 0.04409082653000951, "rewards/code_reward/std": 0.08446960058063269, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.25, "completions/max_terminated_length": 600.25, "completions/mean_length": 480.375, "completions/mean_terminated_length": 480.375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.003693857562613684, "grad_norm": 0.9880874575660351, "kl": 0.045196533203125, "learning_rate": 4.986372137660078e-06, "loss": 0.0123, "num_tokens": 1105880.0, "reward": 0.2375063351355493, "reward_std": 0.21191274048760533, "rewards/code_reward/mean": 0.2375063351355493, "rewards/code_reward/std": 0.21191274095326662, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.25, "completions/max_terminated_length": 810.25, "completions/mean_length": 586.9375, "completions/mean_terminated_length": 586.9375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.003805792640268644, "grad_norm": 0.7360918797059441, "kl": 0.036865234375, "learning_rate": 4.984723574517165e-06, "loss": 0.0507, "num_tokens": 1137102.0, "reward": 0.1048327736207284, "reward_std": 0.12198017432820052, "rewards/code_reward/mean": 0.1048327736207284, "rewards/code_reward/std": 0.12198018177878112, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.25, "completions/max_terminated_length": 671.25, "completions/mean_length": 481.6875, "completions/mean_terminated_length": 481.6875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.0039177277179236044, "grad_norm": 0.8321841584943117, "kl": 0.04937744140625, "learning_rate": 4.9829812468487655e-06, "loss": 0.0045, "num_tokens": 1169516.0, "reward": 0.13488754630088806, "reward_std": 0.15473865950480103, "rewards/code_reward/mean": 0.13488754630088806, "rewards/code_reward/std": 0.15473866136744618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.25, "completions/max_terminated_length": 758.25, "completions/mean_length": 503.5625, "completions/mean_terminated_length": 503.5625, "completions/min_length": 332.75, "completions/min_terminated_length": 332.75, "epoch": 0.004029662795578565, "grad_norm": 0.6486333735221288, "kl": 0.04339599609375, "learning_rate": 4.981145227759457e-06, "loss": 0.032, "num_tokens": 1200070.0, "reward": 0.1878063678741455, "reward_std": 0.2491721287369728, "rewards/code_reward/mean": 0.1878063678741455, "rewards/code_reward/std": 0.2491721287369728, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 447.4375, "completions/mean_terminated_length": 447.4375, "completions/min_length": 265.75, "completions/min_terminated_length": 265.75, "epoch": 0.004141597873233525, "grad_norm": 0.7244692818471474, "kl": 0.06103515625, "learning_rate": 4.979215594284924e-06, "loss": 0.0021, "num_tokens": 1226348.0, "reward": 0.20452898740768433, "reward_std": 0.11701454967260361, "rewards/code_reward/mean": 0.20452898740768433, "rewards/code_reward/std": 0.1170145571231842, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 693.0625, "completions/mean_terminated_length": 693.0625, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.004253532950888485, "grad_norm": 0.849851302505875, "kl": 0.04730224609375, "learning_rate": 4.977192427388722e-06, "loss": 0.0179, "num_tokens": 1269150.0, "reward": 0.27298991987481713, "reward_std": 0.22980366041883826, "rewards/code_reward/mean": 0.27298991987481713, "rewards/code_reward/std": 0.22980366088449955, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.75, "completions/max_terminated_length": 683.75, "completions/mean_length": 558.6875, "completions/mean_terminated_length": 558.6875, "completions/min_length": 429.5, "completions/min_terminated_length": 429.5, "epoch": 0.0043654680285434445, "grad_norm": 0.7007952533946012, "kl": 0.0660400390625, "learning_rate": 4.9750758119588824e-06, "loss": -0.0113, "num_tokens": 1303044.0, "reward": 0.09943000599741936, "reward_std": 0.06831434741616249, "rewards/code_reward/mean": 0.09943000599741936, "rewards/code_reward/std": 0.06831434927880764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.25, "completions/max_terminated_length": 757.25, "completions/mean_length": 580.4375, "completions/mean_terminated_length": 580.4375, "completions/min_length": 434.5, "completions/min_terminated_length": 434.5, "epoch": 0.004477403106198405, "grad_norm": 0.8837854421885906, "kl": 0.0677490234375, "learning_rate": 4.972865836804349e-06, "loss": 0.036, "num_tokens": 1341114.0, "reward": 0.06429215706884861, "reward_std": 0.10811880882829428, "rewards/code_reward/mean": 0.06429215706884861, "rewards/code_reward/std": 0.10811881255358458, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.25, "completions/max_terminated_length": 751.25, "completions/mean_length": 564.15625, "completions/mean_terminated_length": 564.15625, "completions/min_length": 286.25, "completions/min_terminated_length": 286.25, "epoch": 0.004589338183853365, "grad_norm": 0.8908962386984877, "kl": 0.05224609375, "learning_rate": 4.970562594651254e-06, "loss": 0.0452, "num_tokens": 1374855.0, "reward": 0.0228785730432719, "reward_std": 0.06361539242789149, "rewards/code_reward/mean": 0.0228785730432719, "rewards/code_reward/std": 0.06361539429053664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.25, "completions/max_terminated_length": 771.25, "completions/mean_length": 548.25, "completions/mean_terminated_length": 548.25, "completions/min_length": 359.5, "completions/min_terminated_length": 359.5, "epoch": 0.004701273261508325, "grad_norm": 0.6625726727295415, "kl": 0.06036376953125, "learning_rate": 4.968166182139026e-06, "loss": 0.0256, "num_tokens": 1408383.0, "reward": 0.12928921589627862, "reward_std": 0.1418905109167099, "rewards/code_reward/mean": 0.12928921589627862, "rewards/code_reward/std": 0.1418905109167099, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.5, "completions/max_terminated_length": 734.5, "completions/mean_length": 542.28125, "completions/mean_terminated_length": 542.28125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.004813208339163285, "grad_norm": 0.6465061355449234, "kl": 0.06768798828125, "learning_rate": 4.9656766998163306e-06, "loss": 0.0207, "num_tokens": 1446992.0, "reward": 0.10072244703769684, "reward_std": 0.11095328629016876, "rewards/code_reward/mean": 0.10072244703769684, "rewards/code_reward/std": 0.11095329001545906, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1169.75, "completions/max_terminated_length": 960.0, "completions/mean_length": 709.5625, "completions/mean_terminated_length": 673.1160888671875, "completions/min_length": 441.75, "completions/min_terminated_length": 441.75, "epoch": 0.004925143416818246, "grad_norm": 0.5674013554100908, "kl": 0.05987548828125, "learning_rate": 4.963094252136865e-06, "loss": 0.0051, "num_tokens": 1489002.0, "reward": 0.026442307978868484, "reward_std": 0.02707473188638687, "rewards/code_reward/mean": 0.026442307978868484, "rewards/code_reward/std": 0.027074730023741722, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1275.0, "completions/max_terminated_length": 1007.25, "completions/mean_length": 751.75, "completions/mean_terminated_length": 665.2291717529297, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.005037078494473206, "grad_norm": 0.485271480797118, "kl": 0.04827880859375, "learning_rate": 4.960418947454958e-06, "loss": 0.0172, "num_tokens": 1535994.0, "reward": 0.0928819477558136, "reward_std": 0.059048041701316833, "rewards/code_reward/mean": 0.0928819477558136, "rewards/code_reward/std": 0.059048037976026535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.25, "completions/max_terminated_length": 862.25, "completions/mean_length": 557.3125, "completions/mean_terminated_length": 557.3125, "completions/min_length": 322.75, "completions/min_terminated_length": 322.75, "epoch": 0.005149013572128166, "grad_norm": 0.5321904564182419, "kl": 0.0755615234375, "learning_rate": 4.957650898021038e-06, "loss": 0.0296, "num_tokens": 1574756.0, "reward": 0.001838235417380929, "reward_std": 0.0021725620608776808, "rewards/code_reward/mean": 0.001838235417380929, "rewards/code_reward/std": 0.0021725620608776808, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.75, "completions/max_terminated_length": 822.75, "completions/mean_length": 576.875, "completions/mean_terminated_length": 576.875, "completions/min_length": 400.25, "completions/min_terminated_length": 400.25, "epoch": 0.005260948649783125, "grad_norm": 0.936192416992597, "kl": 0.07989501953125, "learning_rate": 4.954790219976915e-06, "loss": 0.0045, "num_tokens": 1610288.0, "reward": 0.1439773216843605, "reward_std": 0.18783001974225044, "rewards/code_reward/mean": 0.1439773216843605, "rewards/code_reward/std": 0.1878300216048956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1394.5, "completions/max_terminated_length": 718.5, "completions/mean_length": 634.0, "completions/mean_terminated_length": 540.5089416503906, "completions/min_length": 364.25, "completions/min_terminated_length": 364.25, "epoch": 0.005372883727438086, "grad_norm": 0.7287543485940715, "kl": 0.0684814453125, "learning_rate": 4.95183703335091e-06, "loss": -0.0097, "num_tokens": 1643344.0, "reward": 0.06508574914187193, "reward_std": 0.14080872386693954, "rewards/code_reward/mean": 0.06508574914187193, "rewards/code_reward/std": 0.14080872386693954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1348.0, "completions/max_terminated_length": 1062.25, "completions/mean_length": 779.1875, "completions/mean_terminated_length": 739.1562652587891, "completions/min_length": 460.5, "completions/min_terminated_length": 460.5, "epoch": 0.005484818805093046, "grad_norm": 0.5005111196440745, "kl": 0.0516357421875, "learning_rate": 4.948791462052819e-06, "loss": 0.0943, "num_tokens": 1697622.0, "reward": 0.03162594046443701, "reward_std": 0.04561105836182833, "rewards/code_reward/mean": 0.03162594046443701, "rewards/code_reward/std": 0.0456110592931509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 550.5, "completions/mean_terminated_length": 550.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.005596753882748006, "grad_norm": 0.6600509702450322, "kl": 0.080078125, "learning_rate": 4.945653633868716e-06, "loss": -0.0276, "num_tokens": 1736334.0, "reward": 0.020256503019481897, "reward_std": 0.05186595255509019, "rewards/code_reward/mean": 0.020256503019481897, "rewards/code_reward/std": 0.05186595278792083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 716.1875, "completions/mean_terminated_length": 716.1875, "completions/min_length": 532.75, "completions/min_terminated_length": 532.75, "epoch": 0.005708688960402966, "grad_norm": 0.7327612072681561, "kl": 0.07684326171875, "learning_rate": 4.942423680455584e-06, "loss": 0.0245, "num_tokens": 1782508.0, "reward": 0.12191444495692849, "reward_std": 0.18840329442173243, "rewards/code_reward/mean": 0.12191444495692849, "rewards/code_reward/std": 0.18840329255908728, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1145.75, "completions/max_terminated_length": 794.5, "completions/mean_length": 594.21875, "completions/mean_terminated_length": 546.3482208251953, "completions/min_length": 401.5, "completions/min_terminated_length": 401.5, "epoch": 0.0058206240380579265, "grad_norm": 0.8852205460832024, "kl": 0.0771484375, "learning_rate": 4.939101737335802e-06, "loss": 0.0285, "num_tokens": 1817019.0, "reward": 0.2491304986178875, "reward_std": 0.283006114885211, "rewards/code_reward/mean": 0.2491304986178875, "rewards/code_reward/std": 0.2830061223357916, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 592.90625, "completions/mean_terminated_length": 592.90625, "completions/min_length": 387.5, "completions/min_terminated_length": 387.5, "epoch": 0.005932559115712887, "grad_norm": 0.9735177234540268, "kl": 0.090576171875, "learning_rate": 4.935687943891447e-06, "loss": 0.0151, "num_tokens": 1857712.0, "reward": 0.05411792593076825, "reward_std": 0.11774230282753706, "rewards/code_reward/mean": 0.05411792593076825, "rewards/code_reward/std": 0.11774230748414993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1032.25, "completions/max_terminated_length": 899.0, "completions/mean_length": 757.0, "completions/mean_terminated_length": 704.8645935058594, "completions/min_length": 557.75, "completions/min_terminated_length": 557.75, "epoch": 0.006044494193367847, "grad_norm": 0.528475149602509, "kl": 0.081939697265625, "learning_rate": 4.932182443358458e-06, "loss": 0.0384, "num_tokens": 1906368.0, "reward": 0.08750000596046448, "reward_std": 0.10493762046098709, "rewards/code_reward/mean": 0.08750000596046448, "rewards/code_reward/std": 0.10493762046098709, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1305.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 582.5, "completions/mean_terminated_length": 488.51341247558594, "completions/min_length": 289.25, "completions/min_terminated_length": 289.25, "epoch": 0.006156429271022806, "grad_norm": 0.9161325800813456, "kl": 0.06402587890625, "learning_rate": 4.928585382820616e-06, "loss": 0.0907, "num_tokens": 1939632.0, "reward": 0.17478298512287438, "reward_std": 0.1567421266809106, "rewards/code_reward/mean": 0.17478298512287438, "rewards/code_reward/std": 0.15674212691374123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1085.75, "completions/max_terminated_length": 757.0, "completions/mean_length": 603.28125, "completions/mean_terminated_length": 558.6964416503906, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.006268364348677767, "grad_norm": 0.8910583501574744, "kl": 0.07037353515625, "learning_rate": 4.924896913203376e-06, "loss": 0.0682, "num_tokens": 1979449.0, "reward": 0.10180415771901608, "reward_std": 0.13188505358994007, "rewards/code_reward/mean": 0.10180415771901608, "rewards/code_reward/std": 0.1318850601091981, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.5, "completions/max_terminated_length": 806.5, "completions/mean_length": 581.03125, "completions/mean_terminated_length": 581.03125, "completions/min_length": 407.5, "completions/min_terminated_length": 407.5, "epoch": 0.006380299426332727, "grad_norm": 1.03891552766019, "kl": 0.08428955078125, "learning_rate": 4.921117189267535e-06, "loss": -0.054, "num_tokens": 2018810.0, "reward": 0.1397020157892257, "reward_std": 0.1752215747255832, "rewards/code_reward/mean": 0.1397020157892257, "rewards/code_reward/std": 0.17522158951032907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 548.53125, "completions/mean_terminated_length": 548.53125, "completions/min_length": 340.25, "completions/min_terminated_length": 340.25, "epoch": 0.006492234503987687, "grad_norm": 0.5842843696982555, "kl": 0.0770263671875, "learning_rate": 4.917246369602742e-06, "loss": 0.0329, "num_tokens": 2054963.0, "reward": 0.10270743072032928, "reward_std": 0.14296946674585342, "rewards/code_reward/mean": 0.10270743072032928, "rewards/code_reward/std": 0.14296947047114372, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.75, "completions/max_terminated_length": 741.75, "completions/mean_length": 581.90625, "completions/mean_terminated_length": 581.90625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.006604169581642647, "grad_norm": 0.8753670558307941, "kl": 0.07373046875, "learning_rate": 4.9132846166208355e-06, "loss": 0.0316, "num_tokens": 2089368.0, "reward": 0.008248626545537263, "reward_std": 0.008944235043600202, "rewards/code_reward/mean": 0.008248626545537263, "rewards/code_reward/std": 0.008944234810769558, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.5, "completions/max_terminated_length": 706.5, "completions/mean_length": 519.03125, "completions/mean_terminated_length": 519.03125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.0067161046592976075, "grad_norm": 0.8925995523113797, "kl": 0.0953369140625, "learning_rate": 4.9092320965490365e-06, "loss": 0.0961, "num_tokens": 2119905.0, "reward": 0.11875520087778568, "reward_std": 0.2019189279526472, "rewards/code_reward/mean": 0.11875520087778568, "rewards/code_reward/std": 0.20191893354058266, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.75, "completions/max_terminated_length": 580.75, "completions/mean_length": 387.21875, "completions/mean_terminated_length": 387.21875, "completions/min_length": 223.5, "completions/min_terminated_length": 223.5, "epoch": 0.006828039736952568, "grad_norm": 0.7379134154137567, "kl": 0.0865478515625, "learning_rate": 4.905088979422971e-06, "loss": 0.0293, "num_tokens": 2148904.0, "reward": 0.021313310135155916, "reward_std": 0.018601362127810717, "rewards/code_reward/mean": 0.021313310135155916, "rewards/code_reward/std": 0.018601362593472004, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1017.5, "completions/max_terminated_length": 684.25, "completions/mean_length": 573.875, "completions/mean_terminated_length": 528.5982208251953, "completions/min_length": 326.25, "completions/min_terminated_length": 326.25, "epoch": 0.006939974814607528, "grad_norm": 0.8052241518254897, "kl": 0.0723876953125, "learning_rate": 4.900855439079536e-06, "loss": 0.0677, "num_tokens": 2190676.0, "reward": 0.055714288260787725, "reward_std": 0.04886896489188075, "rewards/code_reward/mean": 0.055714288260787725, "rewards/code_reward/std": 0.048868965124711394, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.75, "completions/max_terminated_length": 643.75, "completions/mean_length": 476.25, "completions/mean_terminated_length": 476.25, "completions/min_length": 310.5, "completions/min_terminated_length": 310.5, "epoch": 0.007051909892262488, "grad_norm": 0.4680852161414822, "kl": 0.08984375, "learning_rate": 4.8965316531496055e-06, "loss": 0.0257, "num_tokens": 2221412.0, "reward": 0.0011574074160307646, "reward_std": 0.0021431019995361567, "rewards/code_reward/mean": 0.0011574074160307646, "rewards/code_reward/std": 0.0021431019995361567, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1011.5, "completions/max_terminated_length": 646.75, "completions/mean_length": 617.40625, "completions/mean_terminated_length": 520.71875, "completions/min_length": 379.75, "completions/min_terminated_length": 379.75, "epoch": 0.0071638449699174475, "grad_norm": 0.8146582859250777, "kl": 0.0728759765625, "learning_rate": 4.892117803050578e-06, "loss": -0.0054, "num_tokens": 2258025.0, "reward": 0.2782451882958412, "reward_std": 0.2655297741293907, "rewards/code_reward/mean": 0.2782451882958412, "rewards/code_reward/std": 0.2655297741293907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1142.0, "completions/max_terminated_length": 750.5, "completions/mean_length": 635.8125, "completions/mean_terminated_length": 583.9464340209961, "completions/min_length": 439.5, "completions/min_terminated_length": 439.5, "epoch": 0.007275780047572408, "grad_norm": 0.6310220375924765, "kl": 0.07208251953125, "learning_rate": 4.887614073978761e-06, "loss": 0.0001, "num_tokens": 2296595.0, "reward": 0.08172532171010971, "reward_std": 0.08684739097952843, "rewards/code_reward/mean": 0.08172532171010971, "rewards/code_reward/std": 0.08684739866293967, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.25, "completions/max_terminated_length": 660.25, "completions/mean_length": 459.875, "completions/mean_terminated_length": 459.875, "completions/min_length": 291.5, "completions/min_terminated_length": 291.5, "epoch": 0.007387715125227368, "grad_norm": 0.7464732599550883, "kl": 0.10107421875, "learning_rate": 4.883020654901609e-06, "loss": -0.0305, "num_tokens": 2331455.0, "reward": 0.15218659490346909, "reward_std": 0.10367358289659023, "rewards/code_reward/mean": 0.15218659490346909, "rewards/code_reward/std": 0.10367358289659023, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.5, "completions/max_terminated_length": 657.5, "completions/mean_length": 434.59375, "completions/mean_terminated_length": 434.59375, "completions/min_length": 252.75, "completions/min_terminated_length": 252.75, "epoch": 0.007499650202882328, "grad_norm": 0.9414797694529051, "kl": 0.112060546875, "learning_rate": 4.878337738549785e-06, "loss": 0.0316, "num_tokens": 2356986.0, "reward": 0.10024832468479872, "reward_std": 0.18917413474991918, "rewards/code_reward/mean": 0.10024832468479872, "rewards/code_reward/std": 0.18917413474991918, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.25, "completions/max_terminated_length": 927.25, "completions/mean_length": 541.59375, "completions/mean_terminated_length": 541.59375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.007611585280537288, "grad_norm": 0.9428500397053137, "kl": 0.09423828125, "learning_rate": 4.873565521409082e-06, "loss": 0.0535, "num_tokens": 2388693.0, "reward": 0.13753276504576206, "reward_std": 0.11505712405778468, "rewards/code_reward/mean": 0.13753276504576206, "rewards/code_reward/std": 0.11505712429061532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1034.75, "completions/max_terminated_length": 878.0, "completions/mean_length": 647.96875, "completions/mean_terminated_length": 614.2187652587891, "completions/min_length": 344.75, "completions/min_terminated_length": 344.75, "epoch": 0.007723520358192249, "grad_norm": 0.6413922701473375, "kl": 0.0836181640625, "learning_rate": 4.868704203712173e-06, "loss": 0.0367, "num_tokens": 2430900.0, "reward": 0.02671811357140541, "reward_std": 0.036580079700797796, "rewards/code_reward/mean": 0.02671811357140541, "rewards/code_reward/std": 0.03658008202910423, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.25, "completions/max_terminated_length": 733.25, "completions/mean_length": 464.53125, "completions/mean_terminated_length": 464.53125, "completions/min_length": 264.25, "completions/min_terminated_length": 264.25, "epoch": 0.007835455435847209, "grad_norm": 0.9506305895466796, "kl": 0.12646484375, "learning_rate": 4.86375398943021e-06, "loss": 0.0209, "num_tokens": 2466109.0, "reward": 0.10709889512509108, "reward_std": 0.14360995404422283, "rewards/code_reward/mean": 0.10709889512509108, "rewards/code_reward/std": 0.14360996149480343, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.5, "completions/max_terminated_length": 498.5, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 150.75, "completions/min_terminated_length": 150.75, "epoch": 0.007947390513502168, "grad_norm": 0.9998509369749687, "kl": 0.118896484375, "learning_rate": 4.858715086264274e-06, "loss": 0.078, "num_tokens": 2489565.0, "reward": 0.08216492831707001, "reward_std": 0.06686047837138176, "rewards/code_reward/mean": 0.08216492831707001, "rewards/code_reward/std": 0.06686047837138176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.5, "completions/max_terminated_length": 539.5, "completions/mean_length": 371.84375, "completions/mean_terminated_length": 371.84375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.00805932559115713, "grad_norm": 0.8636085966435983, "kl": 0.1112060546875, "learning_rate": 4.853587705636646e-06, "loss": 0.0659, "num_tokens": 2518496.0, "reward": 0.3326955884695053, "reward_std": 0.1749400496482849, "rewards/code_reward/mean": 0.3326955884695053, "rewards/code_reward/std": 0.17494005151093006, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.25, "completions/max_terminated_length": 421.25, "completions/mean_length": 293.46875, "completions/mean_terminated_length": 293.46875, "completions/min_length": 154.5, "completions/min_terminated_length": 154.5, "epoch": 0.008171260668812089, "grad_norm": 0.8623456501670774, "kl": 0.1112060546875, "learning_rate": 4.84837206268195e-06, "loss": -0.0657, "num_tokens": 2541951.0, "reward": 0.41282894741743803, "reward_std": 0.14345367066562176, "rewards/code_reward/mean": 0.41282894741743803, "rewards/code_reward/std": 0.14345368463546038, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.75, "completions/max_terminated_length": 546.75, "completions/mean_length": 413.25, "completions/mean_terminated_length": 413.25, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.00828319574646705, "grad_norm": 0.6709675033187479, "kl": 0.1011962890625, "learning_rate": 4.8430683762381195e-06, "loss": 0.0792, "num_tokens": 2571935.0, "reward": 0.09283980540931225, "reward_std": 0.18030504882335663, "rewards/code_reward/mean": 0.09283980540931225, "rewards/code_reward/std": 0.18030504882335663, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.5, "completions/max_terminated_length": 701.5, "completions/mean_length": 471.4375, "completions/mean_terminated_length": 471.4375, "completions/min_length": 277.75, "completions/min_terminated_length": 277.75, "epoch": 0.008395130824122009, "grad_norm": 0.9648853970330805, "kl": 0.1002197265625, "learning_rate": 4.837676868837213e-06, "loss": 0.0101, "num_tokens": 2603869.0, "reward": 0.09730057418346405, "reward_std": 0.08919950015842915, "rewards/code_reward/mean": 0.09730057418346405, "rewards/code_reward/std": 0.08919950760900974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.5, "completions/max_terminated_length": 705.5, "completions/mean_length": 496.84375, "completions/mean_terminated_length": 496.84375, "completions/min_length": 335.25, "completions/min_terminated_length": 335.25, "epoch": 0.00850706590177697, "grad_norm": 1.0521374237347803, "kl": 0.1251220703125, "learning_rate": 4.832197766696085e-06, "loss": 0.0548, "num_tokens": 2632288.0, "reward": 0.18549207970499992, "reward_std": 0.23195349983870983, "rewards/code_reward/mean": 0.18549207970499992, "rewards/code_reward/std": 0.23195349797606468, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 525.625, "completions/mean_terminated_length": 525.625, "completions/min_length": 347.5, "completions/min_terminated_length": 347.5, "epoch": 0.00861900097943193, "grad_norm": 0.025045228642194366, "kl": 0.1104736328125, "learning_rate": 4.826631299706887e-06, "loss": 0.0011, "num_tokens": 2669812.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.25, "completions/max_terminated_length": 583.25, "completions/mean_length": 442.46875, "completions/mean_terminated_length": 442.46875, "completions/min_length": 208.5, "completions/min_terminated_length": 208.5, "epoch": 0.008730936057086889, "grad_norm": 1.1367736845257632, "kl": 0.1265869140625, "learning_rate": 4.820977701427424e-06, "loss": 0.0331, "num_tokens": 2700491.0, "reward": 0.047237071208655834, "reward_std": 0.10418419446796179, "rewards/code_reward/mean": 0.047237071208655834, "rewards/code_reward/std": 0.10418420331552625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.5, "completions/max_terminated_length": 689.5, "completions/mean_length": 505.6875, "completions/mean_terminated_length": 505.6875, "completions/min_length": 330.25, "completions/min_terminated_length": 330.25, "epoch": 0.00884287113474185, "grad_norm": 0.5193528707901873, "kl": 0.103515625, "learning_rate": 4.81523720907136e-06, "loss": 0.0032, "num_tokens": 2734425.0, "reward": 0.04880136996507645, "reward_std": 0.053204361349344254, "rewards/code_reward/mean": 0.04880136996507645, "rewards/code_reward/std": 0.053204361349344254, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.5, "completions/max_terminated_length": 586.5, "completions/mean_length": 443.75, "completions/mean_terminated_length": 443.75, "completions/min_length": 276.25, "completions/min_terminated_length": 276.25, "epoch": 0.00895480621239681, "grad_norm": 1.0427847884848562, "kl": 0.1318359375, "learning_rate": 4.809410063498254e-06, "loss": 0.0817, "num_tokens": 2765545.0, "reward": 0.2281582325231284, "reward_std": 0.21775340917520225, "rewards/code_reward/mean": 0.2281582325231284, "rewards/code_reward/std": 0.21775341662578285, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1028.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 588.875, "completions/mean_terminated_length": 541.5669708251953, "completions/min_length": 409.75, "completions/min_terminated_length": 409.75, "epoch": 0.00906674129005177, "grad_norm": 0.4518948869491964, "kl": 0.1085205078125, "learning_rate": 4.8034965092034656e-06, "loss": -0.0053, "num_tokens": 2810309.0, "reward": 0.06289062649011612, "reward_std": 0.0699656680226326, "rewards/code_reward/mean": 0.06289062649011612, "rewards/code_reward/std": 0.0699656680226326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 440.5, "completions/mean_terminated_length": 440.5, "completions/min_length": 277.5, "completions/min_terminated_length": 277.5, "epoch": 0.00917867636770673, "grad_norm": 0.8869624498321675, "kl": 0.1461181640625, "learning_rate": 4.797496794307889e-06, "loss": 0.0534, "num_tokens": 2842813.0, "reward": 0.010840552393347025, "reward_std": 0.02040189504623413, "rewards/code_reward/mean": 0.010840552393347025, "rewards/code_reward/std": 0.02040189504623413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.5, "completions/max_terminated_length": 715.5, "completions/mean_length": 515.71875, "completions/mean_terminated_length": 515.71875, "completions/min_length": 315.5, "completions/min_terminated_length": 315.5, "epoch": 0.009290611445361691, "grad_norm": 0.8955728645359476, "kl": 0.1201171875, "learning_rate": 4.791411170547545e-06, "loss": -0.0048, "num_tokens": 2878988.0, "reward": 0.0949601458851248, "reward_std": 0.043580688536167145, "rewards/code_reward/mean": 0.0949601458851248, "rewards/code_reward/std": 0.043580688536167145, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.75, "completions/max_terminated_length": 712.75, "completions/mean_length": 503.875, "completions/mean_terminated_length": 503.875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.00940254652301665, "grad_norm": 0.5731316506045098, "kl": 0.10693359375, "learning_rate": 4.785239893263017e-06, "loss": 0.0657, "num_tokens": 2916960.0, "reward": 0.055458965012803674, "reward_std": 0.06723660067655146, "rewards/code_reward/mean": 0.055458965012803674, "rewards/code_reward/std": 0.06723660079296678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 815.25, "completions/max_terminated_length": 511.75, "completions/mean_length": 416.78125, "completions/mean_terminated_length": 372.52679443359375, "completions/min_length": 259.5, "completions/min_terminated_length": 259.5, "epoch": 0.00951448160067161, "grad_norm": 1.1451584135613004, "kl": 0.138671875, "learning_rate": 4.778983221388742e-06, "loss": 0.0337, "num_tokens": 2944785.0, "reward": 0.020502878935076296, "reward_std": 0.0165937872370705, "rewards/code_reward/mean": 0.020502878935076296, "rewards/code_reward/std": 0.0165937872370705, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.25, "completions/max_terminated_length": 617.25, "completions/mean_length": 446.9375, "completions/mean_terminated_length": 446.9375, "completions/min_length": 280.5, "completions/min_terminated_length": 280.5, "epoch": 0.00962641667832657, "grad_norm": 0.6695429277037601, "kl": 0.143798828125, "learning_rate": 4.77264141744214e-06, "loss": 0.0076, "num_tokens": 2976815.0, "reward": 0.012867647223174572, "reward_std": 0.027845492586493492, "rewards/code_reward/mean": 0.012867647223174572, "rewards/code_reward/std": 0.02784549444913864, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.75, "completions/max_terminated_length": 511.75, "completions/mean_length": 362.0625, "completions/mean_terminated_length": 362.0625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.00973835175598153, "grad_norm": 1.0671674919636422, "kl": 0.11083984375, "learning_rate": 4.766214747512603e-06, "loss": 0.0937, "num_tokens": 2998881.0, "reward": 0.26682692021131516, "reward_std": 0.16957121342420578, "rewards/code_reward/mean": 0.26682692021131516, "rewards/code_reward/std": 0.16957121714949608, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 465.0, "completions/mean_terminated_length": 465.0, "completions/min_length": 320.25, "completions/min_terminated_length": 320.25, "epoch": 0.009850286833636491, "grad_norm": 1.0133211477792508, "kl": 0.1400146484375, "learning_rate": 4.759703481250331e-06, "loss": 0.0317, "num_tokens": 3034097.0, "reward": 0.07334506892948411, "reward_std": 0.11912691406905651, "rewards/code_reward/mean": 0.07334506892948411, "rewards/code_reward/std": 0.11912691593170166, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 289.90625, "completions/mean_terminated_length": 289.90625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.00996222191129145, "grad_norm": 1.0515861072357229, "kl": 0.1114501953125, "learning_rate": 4.753107891855015e-06, "loss": -0.0234, "num_tokens": 3056046.0, "reward": 0.28452102770097554, "reward_std": 0.07553892768919468, "rewards/code_reward/mean": 0.28452102770097554, "rewards/code_reward/std": 0.07553892862051725, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.75, "completions/max_terminated_length": 519.75, "completions/mean_length": 333.6875, "completions/mean_terminated_length": 333.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.010074156988946412, "grad_norm": 0.8537837657218319, "kl": 0.1558837890625, "learning_rate": 4.746428256064375e-06, "loss": 0.0327, "num_tokens": 3079588.0, "reward": 0.14791105315089226, "reward_std": 0.09374275244772434, "rewards/code_reward/mean": 0.14791105315089226, "rewards/code_reward/std": 0.09374275989830494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.5, "completions/max_terminated_length": 498.5, "completions/mean_length": 349.25, "completions/mean_terminated_length": 349.25, "completions/min_length": 185.5, "completions/min_terminated_length": 185.5, "epoch": 0.010186092066601371, "grad_norm": 1.1628188809572875, "kl": 0.16552734375, "learning_rate": 4.7396648541425534e-06, "loss": 0.071, "num_tokens": 3108756.0, "reward": 0.08873509289696813, "reward_std": 0.11791448388248682, "rewards/code_reward/mean": 0.08873509289696813, "rewards/code_reward/std": 0.11791448295116425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.75, "completions/max_terminated_length": 591.75, "completions/mean_length": 385.59375, "completions/mean_terminated_length": 385.59375, "completions/min_length": 168.5, "completions/min_terminated_length": 168.5, "epoch": 0.010298027144256332, "grad_norm": 1.0836904121924078, "kl": 0.140380859375, "learning_rate": 4.732817969868348e-06, "loss": -0.0549, "num_tokens": 3141759.0, "reward": 0.1353156054392457, "reward_std": 0.0778092760592699, "rewards/code_reward/mean": 0.1353156054392457, "rewards/code_reward/std": 0.07780927885323763, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 406.0, "completions/mean_terminated_length": 406.0, "completions/min_length": 236.75, "completions/min_terminated_length": 236.75, "epoch": 0.010409962221911291, "grad_norm": 0.9641885111816164, "kl": 0.142333984375, "learning_rate": 4.7258878905233095e-06, "loss": -0.0439, "num_tokens": 3167615.0, "reward": 0.22870281734503806, "reward_std": 0.08901303343009204, "rewards/code_reward/mean": 0.22870281734503806, "rewards/code_reward/std": 0.08901303354650736, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.25, "completions/max_terminated_length": 550.25, "completions/mean_length": 400.0, "completions/mean_terminated_length": 400.0, "completions/min_length": 247.5, "completions/min_terminated_length": 247.5, "epoch": 0.01052189729956625, "grad_norm": 0.6901557941958711, "kl": 0.139892578125, "learning_rate": 4.718874906879688e-06, "loss": 0.0551, "num_tokens": 3198319.0, "reward": 0.05615717824548483, "reward_std": 0.09094760753214359, "rewards/code_reward/mean": 0.05615717824548483, "rewards/code_reward/std": 0.09094761684536934, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 405.3125, "completions/mean_terminated_length": 405.3125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.010633832377221212, "grad_norm": 1.1099151469644146, "kl": 0.1552734375, "learning_rate": 4.711779313188231e-06, "loss": 0.0123, "num_tokens": 3232577.0, "reward": 0.1259501683525741, "reward_std": 0.10068924725055695, "rewards/code_reward/mean": 0.1259501683525741, "rewards/code_reward/std": 0.10068925376981497, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.75, "completions/max_terminated_length": 447.75, "completions/mean_length": 318.53125, "completions/mean_terminated_length": 318.53125, "completions/min_length": 214.25, "completions/min_terminated_length": 214.25, "epoch": 0.010745767454876171, "grad_norm": 0.9797800882072801, "kl": 0.160888671875, "learning_rate": 4.70460140716584e-06, "loss": 0.0019, "num_tokens": 3260770.0, "reward": 0.15165849681943655, "reward_std": 0.020956640131771564, "rewards/code_reward/mean": 0.15165849681943655, "rewards/code_reward/std": 0.020956639666110277, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.25, "completions/max_terminated_length": 615.25, "completions/mean_length": 405.21875, "completions/mean_terminated_length": 405.21875, "completions/min_length": 257.5, "completions/min_terminated_length": 257.5, "epoch": 0.010857702532531132, "grad_norm": 0.8843772526949033, "kl": 0.1278076171875, "learning_rate": 4.697341489983076e-06, "loss": -0.0262, "num_tokens": 3292233.0, "reward": 0.22608212963677943, "reward_std": 0.17254789546132088, "rewards/code_reward/mean": 0.22608212963677943, "rewards/code_reward/std": 0.17254790337756276, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 432.28125, "completions/mean_terminated_length": 432.28125, "completions/min_length": 277.75, "completions/min_terminated_length": 277.75, "epoch": 0.010969637610186092, "grad_norm": 0.7844902602771369, "kl": 0.14306640625, "learning_rate": 4.6899998662515215e-06, "loss": -0.0089, "num_tokens": 3325538.0, "reward": 0.034402412828058004, "reward_std": 0.08192514721304178, "rewards/code_reward/mean": 0.034402412828058004, "rewards/code_reward/std": 0.08192514767870307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.25, "completions/max_terminated_length": 498.25, "completions/mean_length": 350.1875, "completions/mean_terminated_length": 350.1875, "completions/min_length": 214.5, "completions/min_terminated_length": 214.5, "epoch": 0.011081572687841053, "grad_norm": 1.053730221138067, "kl": 0.1533203125, "learning_rate": 4.682576844011007e-06, "loss": 0.011, "num_tokens": 3360072.0, "reward": 0.03388687747064978, "reward_std": 0.09327089437283576, "rewards/code_reward/mean": 0.03388687747064978, "rewards/code_reward/std": 0.09327089437283576, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.5, "completions/max_terminated_length": 522.5, "completions/mean_length": 355.53125, "completions/mean_terminated_length": 355.53125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.011193507765496012, "grad_norm": 1.1649821726930594, "kl": 0.138671875, "learning_rate": 4.675072734716678e-06, "loss": 0.0495, "num_tokens": 3386449.0, "reward": 0.2362132353009656, "reward_std": 0.2431453033350408, "rewards/code_reward/mean": 0.2362132353009656, "rewards/code_reward/std": 0.2431453038007021, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.25, "completions/max_terminated_length": 550.25, "completions/mean_length": 408.90625, "completions/mean_terminated_length": 408.90625, "completions/min_length": 278.25, "completions/min_terminated_length": 278.25, "epoch": 0.011305442843150973, "grad_norm": 1.0129951041942997, "kl": 0.1279296875, "learning_rate": 4.667487853225931e-06, "loss": 0.0108, "num_tokens": 3415838.0, "reward": 0.11401251330971718, "reward_std": 0.17430034466087818, "rewards/code_reward/mean": 0.11401251330971718, "rewards/code_reward/std": 0.17430034838616848, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.5, "completions/max_terminated_length": 497.5, "completions/mean_length": 321.125, "completions/mean_terminated_length": 321.125, "completions/min_length": 112.75, "completions/min_terminated_length": 112.75, "epoch": 0.011417377920805933, "grad_norm": 0.046761590657522806, "kl": 0.1375732421875, "learning_rate": 4.659822517785203e-06, "loss": 0.0014, "num_tokens": 3437298.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1112.5, "completions/max_terminated_length": 826.0, "completions/mean_length": 582.5, "completions/mean_terminated_length": 539.5535736083984, "completions/min_length": 330.75, "completions/min_terminated_length": 330.75, "epoch": 0.011529312998460892, "grad_norm": 0.936453543752835, "kl": 0.1123046875, "learning_rate": 4.6520770500166165e-06, "loss": -0.0705, "num_tokens": 3476618.0, "reward": 0.11124547757208347, "reward_std": 0.17285121232271194, "rewards/code_reward/mean": 0.11124547757208347, "rewards/code_reward/std": 0.17285121977329254, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 329.78125, "completions/mean_terminated_length": 329.78125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.011641248076115853, "grad_norm": 0.9390473948263904, "kl": 0.153564453125, "learning_rate": 4.644251774904487e-06, "loss": -0.1439, "num_tokens": 3504803.0, "reward": 0.15121639240533113, "reward_std": 0.10038218321278691, "rewards/code_reward/mean": 0.15121639240533113, "rewards/code_reward/std": 0.10038218321278691, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 424.125, "completions/mean_terminated_length": 424.125, "completions/min_length": 292.75, "completions/min_terminated_length": 292.75, "epoch": 0.011753183153770812, "grad_norm": 1.2049832685071231, "kl": 0.167236328125, "learning_rate": 4.636347020781684e-06, "loss": -0.0759, "num_tokens": 3539471.0, "reward": 0.07833968009799719, "reward_std": 0.13395367993507534, "rewards/code_reward/mean": 0.07833968009799719, "rewards/code_reward/std": 0.13395368051715195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 419.71875, "completions/mean_terminated_length": 419.71875, "completions/min_length": 227.5, "completions/min_terminated_length": 227.5, "epoch": 0.011865118231425774, "grad_norm": 0.8526628663219907, "kl": 0.146240234375, "learning_rate": 4.6283631193158605e-06, "loss": 0.039, "num_tokens": 3576414.0, "reward": 0.12546709179878235, "reward_std": 0.13694094121456146, "rewards/code_reward/mean": 0.12546709179878235, "rewards/code_reward/std": 0.13694094866514206, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.5, "completions/max_terminated_length": 574.5, "completions/mean_length": 430.21875, "completions/mean_terminated_length": 430.21875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.011977053309080733, "grad_norm": 1.1258481541663952, "kl": 0.1580810546875, "learning_rate": 4.620300405495532e-06, "loss": -0.0443, "num_tokens": 3613485.0, "reward": 0.11382943368516862, "reward_std": 0.1571302842348814, "rewards/code_reward/mean": 0.11382943368516862, "rewards/code_reward/std": 0.1571302842348814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 514.90625, "completions/mean_terminated_length": 514.90625, "completions/min_length": 229.75, "completions/min_terminated_length": 229.75, "epoch": 0.012088988386735694, "grad_norm": 0.8215610976673118, "kl": 0.1373291015625, "learning_rate": 4.612159217616022e-06, "loss": -0.0201, "num_tokens": 3648370.0, "reward": 0.2085580751299858, "reward_std": 0.1858917400240898, "rewards/code_reward/mean": 0.2085580751299858, "rewards/code_reward/std": 0.185891754925251, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.25, "completions/max_terminated_length": 700.25, "completions/mean_length": 498.0625, "completions/mean_terminated_length": 498.0625, "completions/min_length": 314.25, "completions/min_terminated_length": 314.25, "epoch": 0.012200923464390653, "grad_norm": 1.0580602071827545, "kl": 0.15966796875, "learning_rate": 4.603939897265268e-06, "loss": 0.0414, "num_tokens": 3684604.0, "reward": 0.08671755698742345, "reward_std": 0.06942056433763355, "rewards/code_reward/mean": 0.08671755698742345, "rewards/code_reward/std": 0.06942057458218187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1063.25, "completions/max_terminated_length": 766.75, "completions/mean_length": 572.75, "completions/mean_terminated_length": 522.6160736083984, "completions/min_length": 313.75, "completions/min_terminated_length": 313.75, "epoch": 0.012312858542045613, "grad_norm": 0.6888409929879039, "kl": 0.118896484375, "learning_rate": 4.595642789309492e-06, "loss": -0.246, "num_tokens": 3720668.0, "reward": 0.10784313827753067, "reward_std": 0.164918415248394, "rewards/code_reward/mean": 0.10784313827753067, "rewards/code_reward/std": 0.164918415248394, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.5, "completions/max_terminated_length": 660.5, "completions/mean_length": 428.15625, "completions/mean_terminated_length": 428.15625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.012424793619700574, "grad_norm": 0.6151737403026374, "kl": 0.153564453125, "learning_rate": 4.587268241878724e-06, "loss": 0.0941, "num_tokens": 3757241.0, "reward": 0.0126953125, "reward_std": 0.03590776585042477, "rewards/code_reward/mean": 0.0126953125, "rewards/code_reward/std": 0.03590776678174734, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.25, "completions/max_terminated_length": 644.25, "completions/mean_length": 468.65625, "completions/mean_terminated_length": 468.65625, "completions/min_length": 265.25, "completions/min_terminated_length": 265.25, "epoch": 0.012536728697355533, "grad_norm": 0.8581479811610659, "kl": 0.162841796875, "learning_rate": 4.578816606352205e-06, "loss": -0.0116, "num_tokens": 3786094.0, "reward": 0.10341486724792048, "reward_std": 0.09727730182930827, "rewards/code_reward/mean": 0.10341486724792048, "rewards/code_reward/std": 0.09727730927988887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.75, "completions/max_terminated_length": 602.75, "completions/mean_length": 443.21875, "completions/mean_terminated_length": 443.21875, "completions/min_length": 278.25, "completions/min_terminated_length": 278.25, "epoch": 0.012648663775010494, "grad_norm": 1.032518001055115, "kl": 0.150634765625, "learning_rate": 4.570288237343632e-06, "loss": 0.0057, "num_tokens": 3815197.0, "reward": 0.19150842766975984, "reward_std": 0.17536822147667408, "rewards/code_reward/mean": 0.19150842766975984, "rewards/code_reward/std": 0.17536823637783527, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 471.78125, "completions/mean_terminated_length": 471.78125, "completions/min_length": 229.75, "completions/min_terminated_length": 229.75, "epoch": 0.012760598852665454, "grad_norm": 0.7883185431993377, "kl": 0.154296875, "learning_rate": 4.561683492686289e-06, "loss": -0.0316, "num_tokens": 3849462.0, "reward": 0.15483782812952995, "reward_std": 0.09077820833772421, "rewards/code_reward/mean": 0.15483782812952995, "rewards/code_reward/std": 0.09077820833772421, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.5, "completions/max_terminated_length": 707.5, "completions/mean_length": 486.5, "completions/mean_terminated_length": 486.5, "completions/min_length": 265.5, "completions/min_terminated_length": 265.5, "epoch": 0.012872533930320415, "grad_norm": 0.8556740089529604, "kl": 0.157958984375, "learning_rate": 4.5530027334180285e-06, "loss": 0.117, "num_tokens": 3887790.0, "reward": 0.127931407361757, "reward_std": 0.11615415895357728, "rewards/code_reward/mean": 0.127931407361757, "rewards/code_reward/std": 0.1161541665205732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 393.375, "completions/mean_terminated_length": 393.375, "completions/min_length": 257.25, "completions/min_terminated_length": 257.25, "epoch": 0.012984469007975374, "grad_norm": 0.7897406376014194, "kl": 0.19970703125, "learning_rate": 4.544246323766122e-06, "loss": 0.0248, "num_tokens": 3919554.0, "reward": 0.3160191457718611, "reward_std": 0.041524797677993774, "rewards/code_reward/mean": 0.3160191457718611, "rewards/code_reward/std": 0.041524799540638924, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.25, "completions/max_terminated_length": 539.25, "completions/mean_length": 380.90625, "completions/mean_terminated_length": 380.90625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.013096404085630335, "grad_norm": 1.1134080298302058, "kl": 0.165771484375, "learning_rate": 4.535414631131983e-06, "loss": -0.0078, "num_tokens": 3944911.0, "reward": 0.24838980130152777, "reward_std": 0.09577816107776016, "rewards/code_reward/mean": 0.24838980130152777, "rewards/code_reward/std": 0.09577816678211093, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.5, "completions/max_terminated_length": 570.5, "completions/mean_length": 429.4375, "completions/mean_terminated_length": 429.4375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.013208339163285295, "grad_norm": 0.9495608914425865, "kl": 0.1414794921875, "learning_rate": 4.526508026075746e-06, "loss": 0.0082, "num_tokens": 3972925.0, "reward": 0.14121240563690662, "reward_std": 0.18964817747473717, "rewards/code_reward/mean": 0.14121240563690662, "rewards/code_reward/std": 0.18964817561209202, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.5, "completions/max_terminated_length": 786.5, "completions/mean_length": 586.34375, "completions/mean_terminated_length": 586.34375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.013320274240940254, "grad_norm": 0.02608214090069118, "kl": 0.1209716796875, "learning_rate": 4.517526882300721e-06, "loss": 0.0012, "num_tokens": 4010480.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_reward/mean": 0.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 434.34375, "completions/mean_terminated_length": 434.34375, "completions/min_length": 315.25, "completions/min_terminated_length": 315.25, "epoch": 0.013432209318595215, "grad_norm": 1.1103364222586907, "kl": 0.16455078125, "learning_rate": 4.508471576637713e-06, "loss": 0.0019, "num_tokens": 4047539.0, "reward": 0.11785737407626584, "reward_std": 0.09593676403164864, "rewards/code_reward/mean": 0.11785737407626584, "rewards/code_reward/std": 0.09593676414806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.5, "completions/max_terminated_length": 587.5, "completions/mean_length": 426.9375, "completions/mean_terminated_length": 426.9375, "completions/min_length": 296.25, "completions/min_terminated_length": 296.25, "epoch": 0.013544144396250174, "grad_norm": 0.9365073082574228, "kl": 0.1627197265625, "learning_rate": 4.499342489029211e-06, "loss": -0.0644, "num_tokens": 4073449.0, "reward": 0.25551173387793824, "reward_std": 0.1488891058252193, "rewards/code_reward/mean": 0.25551173387793824, "rewards/code_reward/std": 0.1488891058252193, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.5, "completions/max_terminated_length": 777.5, "completions/mean_length": 561.3125, "completions/mean_terminated_length": 561.3125, "completions/min_length": 308.5, "completions/min_terminated_length": 308.5, "epoch": 0.013656079473905135, "grad_norm": 0.9171469588767179, "kl": 0.16162109375, "learning_rate": 4.490140002513449e-06, "loss": 0.0833, "num_tokens": 4117531.0, "reward": 0.06771073397248983, "reward_std": 0.10771879553794861, "rewards/code_reward/mean": 0.06771073397248983, "rewards/code_reward/std": 0.1077187992632389, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 602.0, "completions/mean_terminated_length": 602.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.013768014551560095, "grad_norm": 0.5430581678238743, "kl": 0.1229248046875, "learning_rate": 4.48086450320833e-06, "loss": 0.0193, "num_tokens": 4164115.0, "reward": 0.06402191519737244, "reward_std": 0.10564571619033813, "rewards/code_reward/mean": 0.06402191519737244, "rewards/code_reward/std": 0.10564571805298328, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.75, "completions/max_terminated_length": 700.75, "completions/mean_length": 464.5, "completions/mean_terminated_length": 464.5, "completions/min_length": 276.25, "completions/min_terminated_length": 276.25, "epoch": 0.013879949629215056, "grad_norm": 0.9177689772341072, "kl": 0.154296875, "learning_rate": 4.4715163802952266e-06, "loss": 0.0239, "num_tokens": 4192843.0, "reward": 0.1863182729575783, "reward_std": 0.09033735934644938, "rewards/code_reward/mean": 0.1863182729575783, "rewards/code_reward/std": 0.09033736307173967, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.5, "completions/max_terminated_length": 752.5, "completions/mean_length": 489.78125, "completions/mean_terminated_length": 489.78125, "completions/min_length": 206.25, "completions/min_terminated_length": 206.25, "epoch": 0.013991884706870015, "grad_norm": 0.8322506450321084, "kl": 0.202392578125, "learning_rate": 4.462096026002655e-06, "loss": 0.0184, "num_tokens": 4227268.0, "reward": 0.21419981867074966, "reward_std": 0.21183521673083305, "rewards/code_reward/mean": 0.21419981867074966, "rewards/code_reward/std": 0.21183520928025246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 540.03125, "completions/mean_terminated_length": 540.03125, "completions/min_length": 284.25, "completions/min_terminated_length": 284.25, "epoch": 0.014103819784524976, "grad_norm": 0.9689144701432464, "kl": 0.146728515625, "learning_rate": 4.4526038355898144e-06, "loss": -0.0308, "num_tokens": 4261797.0, "reward": 0.261167012155056, "reward_std": 0.22630748711526394, "rewards/code_reward/mean": 0.261167012155056, "rewards/code_reward/std": 0.22630748711526394, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.75, "completions/max_terminated_length": 673.75, "completions/mean_length": 450.34375, "completions/mean_terminated_length": 450.34375, "completions/min_length": 210.25, "completions/min_terminated_length": 210.25, "epoch": 0.014215754862179936, "grad_norm": 0.4213790833134132, "kl": 0.144287109375, "learning_rate": 4.4430402073300035e-06, "loss": 0.0292, "num_tokens": 4290992.0, "reward": 0.012987012974917889, "reward_std": 0.013883699662983418, "rewards/code_reward/mean": 0.012987012974917889, "rewards/code_reward/std": 0.013883701525628567, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.25, "completions/max_terminated_length": 664.25, "completions/mean_length": 453.6875, "completions/mean_terminated_length": 453.6875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.014327689939834895, "grad_norm": 0.8712327296844586, "kl": 0.145263671875, "learning_rate": 4.433405542493909e-06, "loss": -0.0154, "num_tokens": 4323358.0, "reward": 0.12838431354612112, "reward_std": 0.14957262016832829, "rewards/code_reward/mean": 0.12838431354612112, "rewards/code_reward/std": 0.14957262203097343, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.75, "completions/max_terminated_length": 840.75, "completions/mean_length": 554.40625, "completions/mean_terminated_length": 554.40625, "completions/min_length": 317.5, "completions/min_terminated_length": 317.5, "epoch": 0.014439625017489856, "grad_norm": 0.8824605681531097, "kl": 0.15185546875, "learning_rate": 4.4237002453327734e-06, "loss": 0.096, "num_tokens": 4357363.0, "reward": 0.22759733814746141, "reward_std": 0.2646235190331936, "rewards/code_reward/mean": 0.22759733814746141, "rewards/code_reward/std": 0.26462352089583874, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.75, "completions/max_terminated_length": 601.75, "completions/mean_length": 434.46875, "completions/mean_terminated_length": 434.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.014551560095144815, "grad_norm": 1.011223461250626, "kl": 0.16943359375, "learning_rate": 4.4139247230614245e-06, "loss": 0.0213, "num_tokens": 4390298.0, "reward": 0.17828914523124695, "reward_std": 0.2471884172409773, "rewards/code_reward/mean": 0.17828914523124695, "rewards/code_reward/std": 0.24718842469155788, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.25, "completions/max_terminated_length": 502.25, "completions/mean_length": 342.0625, "completions/mean_terminated_length": 342.0625, "completions/min_length": 229.75, "completions/min_terminated_length": 229.75, "epoch": 0.014663495172799777, "grad_norm": 0.7287897396776124, "kl": 0.223876953125, "learning_rate": 4.404079385841201e-06, "loss": -0.0213, "num_tokens": 4411124.0, "reward": 0.599999999627471, "reward_std": 0.13620114093646407, "rewards/code_reward/mean": 0.599999999627471, "rewards/code_reward/std": 0.13620115583762527, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 406.1875, "completions/mean_terminated_length": 406.1875, "completions/min_length": 241.5, "completions/min_terminated_length": 241.5, "epoch": 0.014775430250454736, "grad_norm": 1.1828069225002862, "kl": 0.21142578125, "learning_rate": 4.394164646762734e-06, "loss": 0.0079, "num_tokens": 4436370.0, "reward": 0.070248453237582, "reward_std": 0.07654083496890962, "rewards/code_reward/mean": 0.070248453237582, "rewards/code_reward/std": 0.07654083543457091, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.75, "completions/max_terminated_length": 777.75, "completions/mean_length": 456.53125, "completions/mean_terminated_length": 456.53125, "completions/min_length": 206.25, "completions/min_terminated_length": 206.25, "epoch": 0.014887365328109697, "grad_norm": 0.8073699112875448, "kl": 0.1446533203125, "learning_rate": 4.384180921828618e-06, "loss": 0.0692, "num_tokens": 4466595.0, "reward": 0.17728960141539574, "reward_std": 0.20221376791596413, "rewards/code_reward/mean": 0.17728960141539574, "rewards/code_reward/std": 0.20221376977860928, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.5, "completions/max_terminated_length": 632.5, "completions/mean_length": 455.34375, "completions/mean_terminated_length": 455.34375, "completions/min_length": 254.75, "completions/min_terminated_length": 254.75, "epoch": 0.014999300405764656, "grad_norm": 0.8462919317415648, "kl": 0.156494140625, "learning_rate": 4.374128629935955e-06, "loss": 0.0137, "num_tokens": 4500494.0, "reward": 0.1631067901616916, "reward_std": 0.13719243195373565, "rewards/code_reward/mean": 0.1631067901616916, "rewards/code_reward/std": 0.13719243567902595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.75, "completions/max_terminated_length": 654.75, "completions/mean_length": 447.09375, "completions/mean_terminated_length": 447.09375, "completions/min_length": 267.75, "completions/min_terminated_length": 267.75, "epoch": 0.015111235483419617, "grad_norm": 1.0654308580054503, "kl": 0.18505859375, "learning_rate": 4.364008192858781e-06, "loss": -0.0584, "num_tokens": 4531953.0, "reward": 0.30278054997324944, "reward_std": 0.2559507302939892, "rewards/code_reward/mean": 0.30278054997324944, "rewards/code_reward/std": 0.2559507489204407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 524.3125, "completions/mean_terminated_length": 524.3125, "completions/min_length": 310.25, "completions/min_terminated_length": 310.25, "epoch": 0.015223170561074577, "grad_norm": 0.7375182423747296, "kl": 0.1689453125, "learning_rate": 4.353820035230366e-06, "loss": -0.0053, "num_tokens": 4570779.0, "reward": 0.27923886105418205, "reward_std": 0.0807168073952198, "rewards/code_reward/mean": 0.27923886105418205, "rewards/code_reward/std": 0.0807168073952198, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 954.0, "completions/max_terminated_length": 635.25, "completions/mean_length": 518.625, "completions/mean_terminated_length": 471.20982360839844, "completions/min_length": 182.75, "completions/min_terminated_length": 182.75, "epoch": 0.015335105638729536, "grad_norm": 0.7408425833924634, "kl": 0.128662109375, "learning_rate": 4.3435645845254e-06, "loss": -0.0565, "num_tokens": 4603031.0, "reward": 0.08707524091005325, "reward_std": 0.1465706154704094, "rewards/code_reward/mean": 0.08707524091005325, "rewards/code_reward/std": 0.1465706117451191, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.75, "completions/max_terminated_length": 649.75, "completions/mean_length": 465.34375, "completions/mean_terminated_length": 465.34375, "completions/min_length": 238.25, "completions/min_terminated_length": 238.25, "epoch": 0.015447040716384497, "grad_norm": 0.9226530191775734, "kl": 0.196533203125, "learning_rate": 4.333242271042054e-06, "loss": 0.0199, "num_tokens": 4640226.0, "reward": 0.12062139442423359, "reward_std": 0.12237106915563345, "rewards/code_reward/mean": 0.12062139442423359, "rewards/code_reward/std": 0.1223710693884641, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 463.03125, "completions/mean_terminated_length": 463.03125, "completions/min_length": 170.5, "completions/min_terminated_length": 170.5, "epoch": 0.015558975794039457, "grad_norm": 0.5630743662662061, "kl": 0.133544921875, "learning_rate": 4.32285352788393e-06, "loss": -0.0273, "num_tokens": 4672011.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/code_reward/mean": 0.0625, "rewards/code_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.25, "completions/max_terminated_length": 667.25, "completions/mean_length": 473.21875, "completions/mean_terminated_length": 473.21875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.015670910871694418, "grad_norm": 0.8457340493749413, "kl": 0.203125, "learning_rate": 4.312398790941882e-06, "loss": 0.0252, "num_tokens": 4707650.0, "reward": 0.01744219067040831, "reward_std": 0.03082139673642814, "rewards/code_reward/mean": 0.01744219067040831, "rewards/code_reward/std": 0.030821396969258785, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 459.03125, "completions/mean_terminated_length": 459.03125, "completions/min_length": 282.75, "completions/min_terminated_length": 282.75, "epoch": 0.015782845949349377, "grad_norm": 1.0215167669033631, "kl": 0.1568603515625, "learning_rate": 4.301878498875735e-06, "loss": -0.0223, "num_tokens": 4738659.0, "reward": 0.14855818077921867, "reward_std": 0.18304241634905338, "rewards/code_reward/mean": 0.14855818077921867, "rewards/code_reward/std": 0.18304241262376308, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.75, "completions/max_terminated_length": 573.75, "completions/mean_length": 438.84375, "completions/mean_terminated_length": 438.84375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.015894781027004336, "grad_norm": 0.958766664227721, "kl": 0.20068359375, "learning_rate": 4.291293093095873e-06, "loss": 0.0597, "num_tokens": 4769838.0, "reward": 0.0944940485060215, "reward_std": 0.07186714326962829, "rewards/code_reward/mean": 0.0944940485060215, "rewards/code_reward/std": 0.07186715072020888, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.25, "completions/max_terminated_length": 705.25, "completions/mean_length": 510.0625, "completions/mean_terminated_length": 510.0625, "completions/min_length": 270.25, "completions/min_terminated_length": 270.25, "epoch": 0.0160067161046593, "grad_norm": 0.8340511277589133, "kl": 0.191650390625, "learning_rate": 4.280643017744723e-06, "loss": -0.0546, "num_tokens": 4813416.0, "reward": 0.017440817784518003, "reward_std": 0.015970090869814157, "rewards/code_reward/mean": 0.017440817784518003, "rewards/code_reward/std": 0.015970090869814157, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.5, "completions/max_terminated_length": 676.5, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.01611865118231426, "grad_norm": 0.9354512924987337, "kl": 0.1859130859375, "learning_rate": 4.269928719678117e-06, "loss": 0.0158, "num_tokens": 4850540.0, "reward": 0.18274498358368874, "reward_std": 0.1233069859445095, "rewards/code_reward/mean": 0.18274498358368874, "rewards/code_reward/std": 0.1233069896697998, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 417.8125, "completions/mean_terminated_length": 417.8125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.016230586259969218, "grad_norm": 0.8775436859559402, "kl": 0.200927734375, "learning_rate": 4.2591506484465426e-06, "loss": 0.06, "num_tokens": 4880958.0, "reward": 0.1889239656738937, "reward_std": 0.06604543374851346, "rewards/code_reward/mean": 0.1889239656738937, "rewards/code_reward/std": 0.06604543328285217, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.5, "completions/max_terminated_length": 642.5, "completions/mean_length": 453.6875, "completions/mean_terminated_length": 453.6875, "completions/min_length": 225.75, "completions/min_terminated_length": 225.75, "epoch": 0.016342521337624177, "grad_norm": 1.0579923330835856, "kl": 0.190673828125, "learning_rate": 4.248309256276283e-06, "loss": 0.0058, "num_tokens": 4908772.0, "reward": 0.22657467075623572, "reward_std": 0.27265046804677695, "rewards/code_reward/mean": 0.22657467075623572, "rewards/code_reward/std": 0.27265046804677695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 373.4375, "completions/mean_terminated_length": 373.4375, "completions/min_length": 167.75, "completions/min_terminated_length": 167.75, "epoch": 0.016454456415279137, "grad_norm": 1.2602004582489772, "kl": 0.2349853515625, "learning_rate": 4.23740499805044e-06, "loss": 0.0749, "num_tokens": 4935178.0, "reward": 0.3513445816934109, "reward_std": 0.20541435480117798, "rewards/code_reward/mean": 0.3513445816934109, "rewards/code_reward/std": 0.20541436225175858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.25, "completions/max_terminated_length": 619.25, "completions/mean_length": 427.78125, "completions/mean_terminated_length": 427.78125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0165663914929341, "grad_norm": 1.1160890350070682, "kl": 0.17919921875, "learning_rate": 4.22643833128985e-06, "loss": 0.0269, "num_tokens": 4966539.0, "reward": 0.279205069411546, "reward_std": 0.04902365058660507, "rewards/code_reward/mean": 0.279205069411546, "rewards/code_reward/std": 0.04902365151792765, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.25, "completions/max_terminated_length": 665.25, "completions/mean_length": 371.625, "completions/mean_terminated_length": 371.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.01667832657058906, "grad_norm": 1.1750545317228818, "kl": 0.23681640625, "learning_rate": 4.215409716133885e-06, "loss": 0.015, "num_tokens": 5001903.0, "reward": 0.17107138480059803, "reward_std": 0.16521674406249076, "rewards/code_reward/mean": 0.17107138480059803, "rewards/code_reward/std": 0.16521676117554307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.25, "completions/max_terminated_length": 576.25, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "epoch": 0.016790261648244018, "grad_norm": 1.0413783228416158, "kl": 0.249755859375, "learning_rate": 4.204319615321151e-06, "loss": 0.0077, "num_tokens": 5030091.0, "reward": 0.09492883179336786, "reward_std": 0.12909611221402884, "rewards/code_reward/mean": 0.09492883179336786, "rewards/code_reward/std": 0.12909611966460943, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.75, "completions/max_terminated_length": 585.75, "completions/mean_length": 353.1875, "completions/mean_terminated_length": 353.1875, "completions/min_length": 91.25, "completions/min_terminated_length": 91.25, "epoch": 0.016902196725898978, "grad_norm": 1.579544739950842, "kl": 0.50390625, "learning_rate": 4.193168494170065e-06, "loss": 0.0444, "num_tokens": 5057441.0, "reward": 0.600965291261673, "reward_std": 0.2557707913219929, "rewards/code_reward/mean": 0.600965291261673, "rewards/code_reward/std": 0.2557708006352186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.25, "completions/max_terminated_length": 577.25, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.01701413180355394, "grad_norm": 1.3003579285788192, "kl": 0.190673828125, "learning_rate": 4.181956820559339e-06, "loss": 0.132, "num_tokens": 5082069.0, "reward": 0.32964441180229187, "reward_std": 0.2922050729393959, "rewards/code_reward/mean": 0.32964441180229187, "rewards/code_reward/std": 0.2922050729393959, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.75, "completions/max_terminated_length": 463.75, "completions/mean_length": 249.5625, "completions/mean_terminated_length": 249.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.0171260668812089, "grad_norm": 1.2964116512514992, "kl": 0.23046875, "learning_rate": 4.170685064908342e-06, "loss": 0.0824, "num_tokens": 5110151.0, "reward": 0.128064907155931, "reward_std": 0.0706186261959374, "rewards/code_reward/mean": 0.128064907155931, "rewards/code_reward/std": 0.07061862386763096, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.75, "completions/max_terminated_length": 499.75, "completions/mean_length": 290.53125, "completions/mean_terminated_length": 290.53125, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.01723800195886386, "grad_norm": 0.8703199526169038, "kl": 0.276611328125, "learning_rate": 4.159353700157365e-06, "loss": -0.0831, "num_tokens": 5137592.0, "reward": 0.11129332333803177, "reward_std": 0.10705379582941532, "rewards/code_reward/mean": 0.11129332333803177, "rewards/code_reward/std": 0.1070537967607379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 119.75, "completions/min_terminated_length": 119.75, "epoch": 0.01734993703651882, "grad_norm": 1.224937538572504, "kl": 0.201416015625, "learning_rate": 4.14796320174778e-06, "loss": -0.0439, "num_tokens": 5162960.0, "reward": 0.1461925357580185, "reward_std": 0.23236336186528206, "rewards/code_reward/mean": 0.1461925357580185, "rewards/code_reward/std": 0.2323633674532175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 347.59375, "completions/mean_terminated_length": 347.59375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.017461872114173778, "grad_norm": 0.6766785249872507, "kl": 0.1558837890625, "learning_rate": 4.136514047602087e-06, "loss": 0.0103, "num_tokens": 5192755.0, "reward": 0.0625, "reward_std": 0.1157275140285492, "rewards/code_reward/mean": 0.0625, "rewards/code_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.5, "completions/max_terminated_length": 517.5, "completions/mean_length": 301.0625, "completions/mean_terminated_length": 301.0625, "completions/min_length": 112.25, "completions/min_terminated_length": 112.25, "epoch": 0.01757380719182874, "grad_norm": 1.2551889406028631, "kl": 0.197998046875, "learning_rate": 4.1250067181038635e-06, "loss": -0.0209, "num_tokens": 5216477.0, "reward": 0.17783564236015081, "reward_std": 0.24008767772465944, "rewards/code_reward/mean": 0.17783564236015081, "rewards/code_reward/std": 0.24008767493069172, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 232.6875, "completions/mean_terminated_length": 232.6875, "completions/min_length": 86.25, "completions/min_terminated_length": 86.25, "epoch": 0.0176857422694837, "grad_norm": 1.4488039878157586, "kl": 0.1767578125, "learning_rate": 4.113441696077608e-06, "loss": -0.0524, "num_tokens": 5237427.0, "reward": 0.03743714070878923, "reward_std": 0.10588822257705033, "rewards/code_reward/mean": 0.03743714070878923, "rewards/code_reward/std": 0.10588822374120355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.25, "completions/max_terminated_length": 477.25, "completions/mean_length": 279.34375, "completions/mean_terminated_length": 279.34375, "completions/min_length": 133.25, "completions/min_terminated_length": 133.25, "epoch": 0.01779767734713866, "grad_norm": 1.5189669762242, "kl": 0.238037109375, "learning_rate": 4.101819466768484e-06, "loss": -0.1518, "num_tokens": 5268406.0, "reward": 0.08647377614397556, "reward_std": 0.06177530816057697, "rewards/code_reward/mean": 0.08647377614397556, "rewards/code_reward/std": 0.06177531188586727, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.5, "completions/max_terminated_length": 491.5, "completions/mean_length": 260.4375, "completions/mean_terminated_length": 260.4375, "completions/min_length": 71.75, "completions/min_terminated_length": 71.75, "epoch": 0.01790961242479362, "grad_norm": 1.0554249659877584, "kl": 0.147705078125, "learning_rate": 4.0901405178219535e-06, "loss": 0.0005, "num_tokens": 5291300.0, "reward": 0.04570723883807659, "reward_std": 0.07690948667004704, "rewards/code_reward/mean": 0.04570723883807659, "rewards/code_reward/std": 0.07690948317758739, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 341.28125, "completions/mean_terminated_length": 341.28125, "completions/min_length": 155.75, "completions/min_terminated_length": 155.75, "epoch": 0.018021547502448578, "grad_norm": 0.9717735557494784, "kl": 0.212890625, "learning_rate": 4.078405339263326e-06, "loss": -0.0304, "num_tokens": 5321093.0, "reward": 0.053125000558793545, "reward_std": 0.07709404267370701, "rewards/code_reward/mean": 0.053125000558793545, "rewards/code_reward/std": 0.07709404919296503, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.5, "completions/max_terminated_length": 480.5, "completions/mean_length": 237.9375, "completions/mean_terminated_length": 237.9375, "completions/min_length": 116.75, "completions/min_terminated_length": 116.75, "epoch": 0.01813348258010354, "grad_norm": 1.173075355333341, "kl": 0.188232421875, "learning_rate": 4.06661442347719e-06, "loss": -0.0205, "num_tokens": 5348659.0, "reward": 0.2592630833387375, "reward_std": 0.15858712047338486, "rewards/code_reward/mean": 0.2592630833387375, "rewards/code_reward/std": 0.15858712792396545, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.75, "completions/max_terminated_length": 530.75, "completions/mean_length": 297.1875, "completions/mean_terminated_length": 297.1875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.0182454176577585, "grad_norm": 1.287995519073581, "kl": 0.18896484375, "learning_rate": 4.054768265186758e-06, "loss": -0.0652, "num_tokens": 5372217.0, "reward": 0.33238982781767845, "reward_std": 0.27272730600088835, "rewards/code_reward/mean": 0.33238982781767845, "rewards/code_reward/std": 0.2727273255586624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.5, "completions/max_terminated_length": 636.5, "completions/mean_length": 292.53125, "completions/mean_terminated_length": 292.53125, "completions/min_length": 117.25, "completions/min_terminated_length": 117.25, "epoch": 0.01835735273541346, "grad_norm": 1.1060543366676017, "kl": 0.165283203125, "learning_rate": 4.0428673614331036e-06, "loss": 0.0064, "num_tokens": 5397890.0, "reward": 0.20836169831454754, "reward_std": 0.17235604114830494, "rewards/code_reward/mean": 0.20836169831454754, "rewards/code_reward/std": 0.17235605791211128, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.5, "completions/max_terminated_length": 814.5, "completions/mean_length": 351.71875, "completions/mean_terminated_length": 351.71875, "completions/min_length": 145.75, "completions/min_terminated_length": 145.75, "epoch": 0.01846928781306842, "grad_norm": 0.7382591236061559, "kl": 0.1884765625, "learning_rate": 4.030912211554316e-06, "loss": 0.0313, "num_tokens": 5423913.0, "reward": 0.13007790176197886, "reward_std": 0.05658754054456949, "rewards/code_reward/mean": 0.13007790176197886, "rewards/code_reward/std": 0.056587545201182365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.25, "completions/max_terminated_length": 539.25, "completions/mean_length": 299.21875, "completions/mean_terminated_length": 299.21875, "completions/min_length": 131.25, "completions/min_terminated_length": 131.25, "epoch": 0.018581222890723382, "grad_norm": 1.373915385466877, "kl": 0.193359375, "learning_rate": 4.018903317164539e-06, "loss": -0.1003, "num_tokens": 5448488.0, "reward": 0.08751785231288522, "reward_std": 0.11654674645978957, "rewards/code_reward/mean": 0.08751785231288522, "rewards/code_reward/std": 0.11654674645978957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 359.65625, "completions/mean_terminated_length": 359.65625, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.01869315796837834, "grad_norm": 1.0340989098627629, "kl": 0.17822265625, "learning_rate": 4.006841182132932e-06, "loss": -0.0343, "num_tokens": 5474285.0, "reward": 0.1759367436170578, "reward_std": 0.2240792140364647, "rewards/code_reward/mean": 0.1759367436170578, "rewards/code_reward/std": 0.224079217761755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.75, "completions/max_terminated_length": 461.75, "completions/mean_length": 281.5625, "completions/mean_terminated_length": 281.5625, "completions/min_length": 102.25, "completions/min_terminated_length": 102.25, "epoch": 0.0188050930460333, "grad_norm": 1.4134315706278993, "kl": 0.210205078125, "learning_rate": 3.9947263125625195e-06, "loss": 0.013, "num_tokens": 5498599.0, "reward": 0.3900106647051871, "reward_std": 0.2551127364858985, "rewards/code_reward/mean": 0.3900106647051871, "rewards/code_reward/std": 0.25511275534518063, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1069.5, "completions/max_terminated_length": 671.0, "completions/mean_length": 497.46875, "completions/mean_terminated_length": 441.75, "completions/min_length": 222.75, "completions/min_terminated_length": 222.75, "epoch": 0.01891702812368826, "grad_norm": 1.0149191598734515, "kl": 0.197998046875, "learning_rate": 3.982559216768967e-06, "loss": 0.0765, "num_tokens": 5530310.0, "reward": 0.1429782696068287, "reward_std": 0.16231020726263523, "rewards/code_reward/mean": 0.1429782696068287, "rewards/code_reward/std": 0.16231020539999008, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.5, "completions/max_terminated_length": 601.5, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 166.5, "completions/min_terminated_length": 166.5, "epoch": 0.01902896320134322, "grad_norm": 0.8227739015604961, "kl": 0.18212890625, "learning_rate": 3.970340405259245e-06, "loss": 0.1136, "num_tokens": 5562970.0, "reward": 0.1685887835919857, "reward_std": 0.17748497053980827, "rewards/code_reward/mean": 0.1685887835919857, "rewards/code_reward/std": 0.17748496308922768, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.25, "completions/max_terminated_length": 595.25, "completions/mean_length": 413.125, "completions/mean_terminated_length": 413.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.019140898278998182, "grad_norm": 1.0311180144750687, "kl": 0.2353515625, "learning_rate": 3.958070390710214e-06, "loss": -0.0245, "num_tokens": 5591150.0, "reward": 0.1419127695262432, "reward_std": 0.12009143829345703, "rewards/code_reward/mean": 0.1419127695262432, "rewards/code_reward/std": 0.12009144574403763, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 454.03125, "completions/mean_terminated_length": 454.03125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.01925283335665314, "grad_norm": 0.9357441509233236, "kl": 0.19580078125, "learning_rate": 3.945749687947109e-06, "loss": -0.0136, "num_tokens": 5620991.0, "reward": 0.06574675627052784, "reward_std": 0.07077404530718923, "rewards/code_reward/mean": 0.06574675627052784, "rewards/code_reward/std": 0.07077404530718923, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.75, "completions/max_terminated_length": 678.75, "completions/mean_length": 487.78125, "completions/mean_terminated_length": 487.78125, "completions/min_length": 288.25, "completions/min_terminated_length": 288.25, "epoch": 0.0193647684343081, "grad_norm": 1.0872281099064747, "kl": 0.210205078125, "learning_rate": 3.933378813921942e-06, "loss": -0.0373, "num_tokens": 5656416.0, "reward": 0.1226367698982358, "reward_std": 0.20265722228214145, "rewards/code_reward/mean": 0.1226367698982358, "rewards/code_reward/std": 0.20265722228214145, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.75, "completions/max_terminated_length": 523.75, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 95.75, "completions/min_terminated_length": 95.75, "epoch": 0.01947670351196306, "grad_norm": 1.0504957433338074, "kl": 0.21142578125, "learning_rate": 3.920958287691811e-06, "loss": -0.0152, "num_tokens": 5680844.0, "reward": 0.4488864839076996, "reward_std": 0.3014371059834957, "rewards/code_reward/mean": 0.4488864839076996, "rewards/code_reward/std": 0.3014371246099472, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 430.875, "completions/mean_terminated_length": 430.875, "completions/min_length": 249.5, "completions/min_terminated_length": 249.5, "epoch": 0.019588638589618023, "grad_norm": 0.7795786053104797, "kl": 0.229736328125, "learning_rate": 3.908488630397121e-06, "loss": 0.0764, "num_tokens": 5713200.0, "reward": 0.04957035928964615, "reward_std": 0.06563462410122156, "rewards/code_reward/mean": 0.04957035928964615, "rewards/code_reward/std": 0.06563462410122156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.75, "completions/max_terminated_length": 609.75, "completions/mean_length": 444.75, "completions/mean_terminated_length": 444.75, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.019700573667272982, "grad_norm": 0.8790829345446359, "kl": 0.202392578125, "learning_rate": 3.8959703652397175e-06, "loss": 0.0125, "num_tokens": 5742760.0, "reward": 0.06789090437814593, "reward_std": 0.10605220403522253, "rewards/code_reward/mean": 0.06789090437814593, "rewards/code_reward/std": 0.10605220403522253, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 921.75, "completions/max_terminated_length": 651.0, "completions/mean_length": 502.5625, "completions/mean_terminated_length": 457.6607208251953, "completions/min_length": 233.75, "completions/min_terminated_length": 233.75, "epoch": 0.019812508744927942, "grad_norm": 0.6856544409285574, "kl": 0.212890625, "learning_rate": 3.883404017460935e-06, "loss": 0.0149, "num_tokens": 5776802.0, "reward": 0.125, "reward_std": 0.13363061845302582, "rewards/code_reward/mean": 0.125, "rewards/code_reward/std": 0.13363061845302582, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.25, "completions/max_terminated_length": 573.25, "completions/mean_length": 400.78125, "completions/mean_terminated_length": 400.78125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.0199244438225829, "grad_norm": 0.7758589030560867, "kl": 0.256591796875, "learning_rate": 3.870790114319559e-06, "loss": -0.0786, "num_tokens": 5804987.0, "reward": 0.2419273192062974, "reward_std": 0.05694087781012058, "rewards/code_reward/mean": 0.2419273192062974, "rewards/code_reward/std": 0.05694088339805603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.25, "completions/max_terminated_length": 721.25, "completions/mean_length": 484.3125, "completions/mean_terminated_length": 484.3125, "completions/min_length": 227.75, "completions/min_terminated_length": 227.75, "epoch": 0.02003637890023786, "grad_norm": 0.8831663988161996, "kl": 0.20166015625, "learning_rate": 3.858129185069701e-06, "loss": -0.0142, "num_tokens": 5838165.0, "reward": 0.15460877772420645, "reward_std": 0.1456797532737255, "rewards/code_reward/mean": 0.15460877772420645, "rewards/code_reward/std": 0.1456797607243061, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 943.5, "completions/max_terminated_length": 603.5, "completions/mean_length": 490.1875, "completions/mean_terminated_length": 443.3526916503906, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.020148313977892823, "grad_norm": 1.051749696084055, "kl": 0.194091796875, "learning_rate": 3.845421760938597e-06, "loss": 0.0364, "num_tokens": 5868107.0, "reward": 0.052815594244748354, "reward_std": 0.11768656317144632, "rewards/code_reward/mean": 0.052815594244748354, "rewards/code_reward/std": 0.11768656317144632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1064.75, "completions/max_terminated_length": 742.5, "completions/mean_length": 527.46875, "completions/mean_terminated_length": 478.9464340209961, "completions/min_length": 230.5, "completions/min_terminated_length": 230.5, "epoch": 0.020260249055547783, "grad_norm": 0.9254371836384011, "kl": 0.224365234375, "learning_rate": 3.832668375104312e-06, "loss": 0.0786, "num_tokens": 5900842.0, "reward": 0.11509167775511742, "reward_std": 0.2528133289888501, "rewards/code_reward/mean": 0.11509167775511742, "rewards/code_reward/std": 0.2528133289888501, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.75, "completions/max_terminated_length": 607.75, "completions/mean_length": 416.75, "completions/mean_terminated_length": 416.75, "completions/min_length": 192.75, "completions/min_terminated_length": 192.75, "epoch": 0.020372184133202742, "grad_norm": 1.1539914541007663, "kl": 0.260009765625, "learning_rate": 3.8198695626733725e-06, "loss": -0.0358, "num_tokens": 5926258.0, "reward": 0.2809056378901005, "reward_std": 0.25853854790329933, "rewards/code_reward/mean": 0.2809056378901005, "rewards/code_reward/std": 0.2585385534912348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 439.6875, "completions/mean_terminated_length": 439.6875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.0204841192108577, "grad_norm": 0.7671721848701089, "kl": 0.207763671875, "learning_rate": 3.8070258606583156e-06, "loss": -0.0237, "num_tokens": 5955016.0, "reward": 0.118256576359272, "reward_std": 0.10753975436091423, "rewards/code_reward/mean": 0.118256576359272, "rewards/code_reward/std": 0.10753976181149483, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.75, "completions/max_terminated_length": 727.75, "completions/mean_length": 514.6875, "completions/mean_terminated_length": 514.6875, "completions/min_length": 295.75, "completions/min_terminated_length": 295.75, "epoch": 0.020596054288512664, "grad_norm": 0.9937661413851208, "kl": 0.22119140625, "learning_rate": 3.7941378079551544e-06, "loss": -0.0669, "num_tokens": 5988158.0, "reward": 0.017067496781237423, "reward_std": 0.02749600470997393, "rewards/code_reward/mean": 0.017067496781237423, "rewards/code_reward/std": 0.027496004942804575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1087.5, "completions/max_terminated_length": 682.0, "completions/mean_length": 524.875, "completions/mean_terminated_length": 471.5446472167969, "completions/min_length": 291.25, "completions/min_terminated_length": 291.25, "epoch": 0.020707989366167624, "grad_norm": 0.6638578813804005, "kl": 0.237060546875, "learning_rate": 3.7812059453207677e-06, "loss": 0.1742, "num_tokens": 6023682.0, "reward": 0.1103343702852726, "reward_std": 0.11875982582569122, "rewards/code_reward/mean": 0.1103343702852726, "rewards/code_reward/std": 0.11875982582569122, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.75, "completions/max_terminated_length": 968.75, "completions/mean_length": 510.34375, "completions/mean_terminated_length": 510.34375, "completions/min_length": 203.5, "completions/min_terminated_length": 203.5, "epoch": 0.020819924443822583, "grad_norm": 1.111764046709291, "kl": 0.22021484375, "learning_rate": 3.768230815350213e-06, "loss": -0.2216, "num_tokens": 6058277.0, "reward": 0.08250047732144594, "reward_std": 0.18678564205765724, "rewards/code_reward/mean": 0.08250047732144594, "rewards/code_reward/std": 0.18678564997389913, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1082.75, "completions/max_terminated_length": 816.5, "completions/mean_length": 579.78125, "completions/mean_terminated_length": 536.25, "completions/min_length": 290.75, "completions/min_terminated_length": 290.75, "epoch": 0.020931859521477542, "grad_norm": 0.6213788285878431, "kl": 0.198974609375, "learning_rate": 3.7552129624539557e-06, "loss": 0.0099, "num_tokens": 6096662.0, "reward": 0.15393732488155365, "reward_std": 0.158139206469059, "rewards/code_reward/mean": 0.15393732488155365, "rewards/code_reward/std": 0.158139206469059, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1410.25, "completions/max_terminated_length": 706.5, "completions/mean_length": 560.1875, "completions/mean_terminated_length": 458.20983123779297, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.0210437945991325, "grad_norm": 0.7369665680166173, "kl": 0.207275390625, "learning_rate": 3.7421529328350316e-06, "loss": 0.0253, "num_tokens": 6130348.0, "reward": 0.016329039994161576, "reward_std": 0.015793586208019406, "rewards/code_reward/mean": 0.016329039994161576, "rewards/code_reward/std": 0.01579358527669683, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.75, "completions/max_terminated_length": 808.75, "completions/mean_length": 575.625, "completions/mean_terminated_length": 575.625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.021155729676787464, "grad_norm": 0.8246271999593108, "kl": 0.204345703125, "learning_rate": 3.7290512744661274e-06, "loss": 0.0457, "num_tokens": 6171304.0, "reward": 0.07762476638890803, "reward_std": 0.16000637132674456, "rewards/code_reward/mean": 0.07762476638890803, "rewards/code_reward/std": 0.16000637412071228, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1693.5, "completions/max_terminated_length": 887.75, "completions/mean_length": 880.3125, "completions/mean_terminated_length": 621.9541931152344, "completions/min_length": 358.75, "completions/min_terminated_length": 358.75, "epoch": 0.021267664754442424, "grad_norm": 0.7247939676119982, "kl": 0.166748046875, "learning_rate": 3.715908537066589e-06, "loss": -0.0747, "num_tokens": 6218770.0, "reward": 0.18021205358672887, "reward_std": 0.20904676476493478, "rewards/code_reward/mean": 0.18021205358672887, "rewards/code_reward/std": 0.2090467723319307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1439.5, "completions/max_terminated_length": 634.25, "completions/mean_length": 787.9375, "completions/mean_terminated_length": 455.0029830932617, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.021379599832097383, "grad_norm": 0.5466790904917126, "kl": 0.152587890625, "learning_rate": 3.7027252720793538e-06, "loss": 0.1295, "num_tokens": 6262056.0, "reward": 0.193359375, "reward_std": 0.15328529477119446, "rewards/code_reward/mean": 0.193359375, "rewards/code_reward/std": 0.15328530967235565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1206.75, "completions/max_terminated_length": 850.0, "completions/mean_length": 588.03125, "completions/mean_terminated_length": 537.1607208251953, "completions/min_length": 300.25, "completions/min_terminated_length": 300.25, "epoch": 0.021491534909752343, "grad_norm": 0.8256952276538428, "kl": 0.239501953125, "learning_rate": 3.689502032647817e-06, "loss": -0.1993, "num_tokens": 6310129.0, "reward": 0.11067206133157015, "reward_std": 0.11477606277912855, "rewards/code_reward/mean": 0.11067206133157015, "rewards/code_reward/std": 0.11477606697008014, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1253.5, "completions/max_terminated_length": 952.0, "completions/mean_length": 544.9375, "completions/mean_terminated_length": 496.7276840209961, "completions/min_length": 312.75, "completions/min_terminated_length": 312.75, "epoch": 0.021603469987407305, "grad_norm": 0.9219945943560189, "kl": 0.228759765625, "learning_rate": 3.6762393735926245e-06, "loss": 0.0478, "num_tokens": 6343727.0, "reward": 0.08743459376273677, "reward_std": 0.060669250786304474, "rewards/code_reward/mean": 0.08743459376273677, "rewards/code_reward/std": 0.06066925637423992, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.75, "completions/max_terminated_length": 747.75, "completions/mean_length": 467.28125, "completions/mean_terminated_length": 467.28125, "completions/min_length": 274.25, "completions/min_terminated_length": 274.25, "epoch": 0.021715405065062265, "grad_norm": 1.087269428253631, "kl": 0.224853515625, "learning_rate": 3.6629378513883852e-06, "loss": -0.0435, "num_tokens": 6369656.0, "reward": 0.01468671576003544, "reward_std": 0.0163727342733182, "rewards/code_reward/mean": 0.01468671576003544, "rewards/code_reward/std": 0.0163727342733182, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1376.0, "completions/max_terminated_length": 711.5, "completions/mean_length": 528.34375, "completions/mean_terminated_length": 428.1741180419922, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.021827340142717224, "grad_norm": 0.877705585064893, "kl": 0.191162109375, "learning_rate": 3.6495980241403307e-06, "loss": -0.0435, "num_tokens": 6402635.0, "reward": 0.28124301601201296, "reward_std": 0.1287369872443378, "rewards/code_reward/mean": 0.28124301601201296, "rewards/code_reward/std": 0.1287369979545474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1505.75, "completions/max_terminated_length": 884.0, "completions/mean_length": 629.875, "completions/mean_terminated_length": 531.4151840209961, "completions/min_length": 338.5, "completions/min_terminated_length": 338.5, "epoch": 0.021939275220372183, "grad_norm": 0.7532827929414431, "kl": 0.193115234375, "learning_rate": 3.636220451560896e-06, "loss": 0.067, "num_tokens": 6441607.0, "reward": 0.07744654751149938, "reward_std": 0.049496792489662766, "rewards/code_reward/mean": 0.07744654751149938, "rewards/code_reward/std": 0.04949679644778371, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1397.75, "completions/max_terminated_length": 630.75, "completions/mean_length": 608.0, "completions/mean_terminated_length": 451.27679443359375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.022051210298027143, "grad_norm": 0.8485874245870415, "kl": 0.191162109375, "learning_rate": 3.622805694946235e-06, "loss": -0.1349, "num_tokens": 6479383.0, "reward": 0.26938944309949875, "reward_std": 0.26605916023254395, "rewards/code_reward/mean": 0.26938944309949875, "rewards/code_reward/std": 0.26605917513370514, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1237.75, "completions/max_terminated_length": 865.5, "completions/mean_length": 552.3125, "completions/mean_terminated_length": 501.0982208251953, "completions/min_length": 266.25, "completions/min_terminated_length": 266.25, "epoch": 0.022163145375682106, "grad_norm": 0.824562163366152, "kl": 0.19140625, "learning_rate": 3.609354317152667e-06, "loss": -0.025, "num_tokens": 6511257.0, "reward": 0.05427030206192285, "reward_std": 0.047577258897945285, "rewards/code_reward/mean": 0.05427030206192285, "rewards/code_reward/std": 0.04757725913077593, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1021.5, "completions/max_terminated_length": 694.75, "completions/mean_length": 498.78125, "completions/mean_terminated_length": 447.3214340209961, "completions/min_length": 226.5, "completions/min_terminated_length": 226.5, "epoch": 0.022275080453337065, "grad_norm": 1.0082207311300992, "kl": 0.229736328125, "learning_rate": 3.595866882573063e-06, "loss": 0.0149, "num_tokens": 6541178.0, "reward": 0.2747242748737335, "reward_std": 0.2067141029983759, "rewards/code_reward/mean": 0.2747242748737335, "rewards/code_reward/std": 0.20671410486102104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 980.0, "completions/max_terminated_length": 573.25, "completions/mean_length": 469.53125, "completions/mean_terminated_length": 416.4330368041992, "completions/min_length": 246.5, "completions/min_terminated_length": 246.5, "epoch": 0.022387015530992024, "grad_norm": 0.9952889000470718, "kl": 0.216064453125, "learning_rate": 3.5823439571131675e-06, "loss": -0.1906, "num_tokens": 6570243.0, "reward": 0.1142054102383554, "reward_std": 0.18550929613411427, "rewards/code_reward/mean": 0.1142054102383554, "rewards/code_reward/std": 0.18550931010395288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.25, "completions/max_terminated_length": 764.25, "completions/mean_length": 500.96875, "completions/mean_terminated_length": 500.96875, "completions/min_length": 323.5, "completions/min_terminated_length": 323.5, "epoch": 0.022498950608646984, "grad_norm": 0.9455318918616956, "kl": 0.227294921875, "learning_rate": 3.5687861081678477e-06, "loss": 0.031, "num_tokens": 6603946.0, "reward": 0.16931893583387136, "reward_std": 0.20531310141086578, "rewards/code_reward/mean": 0.16931893583387136, "rewards/code_reward/std": 0.20531310513615608, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1030.5, "completions/max_terminated_length": 688.5, "completions/mean_length": 418.1875, "completions/mean_terminated_length": 367.9821472167969, "completions/min_length": 194.75, "completions/min_terminated_length": 194.75, "epoch": 0.022610885686301947, "grad_norm": 1.0325105547845679, "kl": 0.2138671875, "learning_rate": 3.555193904597291e-06, "loss": 0.0613, "num_tokens": 6636552.0, "reward": 0.3791414946317673, "reward_std": 0.17875608056783676, "rewards/code_reward/mean": 0.3791414946317673, "rewards/code_reward/std": 0.17875608801841736, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.25, "completions/max_terminated_length": 754.25, "completions/mean_length": 455.34375, "completions/mean_terminated_length": 455.34375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.022722820763956906, "grad_norm": 0.9727989512601006, "kl": 0.22119140625, "learning_rate": 3.541567916703138e-06, "loss": 0.0159, "num_tokens": 6668595.0, "reward": 0.19191165082156658, "reward_std": 0.15741402097046375, "rewards/code_reward/mean": 0.19191165082156658, "rewards/code_reward/std": 0.15741403214633465, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 989.25, "completions/max_terminated_length": 615.5, "completions/mean_length": 476.40625, "completions/mean_terminated_length": 425.71875762939453, "completions/min_length": 255.25, "completions/min_terminated_length": 255.25, "epoch": 0.022834755841611865, "grad_norm": 0.7440983993424026, "kl": 0.203857421875, "learning_rate": 3.5279087162045517e-06, "loss": 0.0571, "num_tokens": 6702376.0, "reward": 0.11487132962793112, "reward_std": 0.1093948557972908, "rewards/code_reward/mean": 0.11487132962793112, "rewards/code_reward/std": 0.1093948557972908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.25, "completions/max_terminated_length": 861.25, "completions/mean_length": 514.59375, "completions/mean_terminated_length": 514.59375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.022946690919266825, "grad_norm": 0.9601225973651024, "kl": 0.18505859375, "learning_rate": 3.5142168762142265e-06, "loss": 0.0168, "num_tokens": 6739939.0, "reward": 0.07519801473245025, "reward_std": 0.09981238306500018, "rewards/code_reward/mean": 0.07519801473245025, "rewards/code_reward/std": 0.09981238329783082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 914.25, "completions/max_terminated_length": 670.75, "completions/mean_length": 427.03125, "completions/mean_terminated_length": 379.75, "completions/min_length": 184.25, "completions/min_terminated_length": 184.25, "epoch": 0.023058625996921784, "grad_norm": 0.8984794673889067, "kl": 0.207763671875, "learning_rate": 3.500492971214347e-06, "loss": 0.1382, "num_tokens": 6769180.0, "reward": 0.22681757621467113, "reward_std": 0.20832497254014015, "rewards/code_reward/mean": 0.22681757621467113, "rewards/code_reward/std": 0.20832498744130135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1280.25, "completions/max_terminated_length": 1128.0, "completions/mean_length": 650.65625, "completions/mean_terminated_length": 608.4821472167969, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.023170561074576747, "grad_norm": 0.7922918264942025, "kl": 0.1728515625, "learning_rate": 3.48673757703248e-06, "loss": -0.0944, "num_tokens": 6805289.0, "reward": 0.14009581343270838, "reward_std": 0.11429419624619186, "rewards/code_reward/mean": 0.14009581343270838, "rewards/code_reward/std": 0.11429420742206275, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1129.5, "completions/max_terminated_length": 776.25, "completions/mean_length": 539.34375, "completions/mean_terminated_length": 491.6026916503906, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.023282496152231706, "grad_norm": 0.9119324704049495, "kl": 0.1783447265625, "learning_rate": 3.472951270817418e-06, "loss": -0.064, "num_tokens": 6837436.0, "reward": 0.05461701576132327, "reward_std": 0.09534355666255578, "rewards/code_reward/mean": 0.05461701576132327, "rewards/code_reward/std": 0.09534356038784608, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.25, "completions/max_terminated_length": 903.25, "completions/mean_length": 384.75, "completions/mean_terminated_length": 384.75, "completions/min_length": 159.75, "completions/min_terminated_length": 159.75, "epoch": 0.023394431229886666, "grad_norm": 1.2240979756453376, "kl": 0.183349609375, "learning_rate": 3.4591346310149578e-06, "loss": 0.0503, "num_tokens": 6864492.0, "reward": 0.4224591121310368, "reward_std": 0.2887880225898698, "rewards/code_reward/mean": 0.4224591121310368, "rewards/code_reward/std": 0.2887880523921922, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.75, "completions/max_terminated_length": 767.75, "completions/mean_length": 399.28125, "completions/mean_terminated_length": 399.28125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.023506366307541625, "grad_norm": 1.1994406910258044, "kl": 0.204833984375, "learning_rate": 3.445288237343632e-06, "loss": -0.0509, "num_tokens": 6891213.0, "reward": 0.09320073015987873, "reward_std": 0.12864024192094803, "rewards/code_reward/mean": 0.09320073015987873, "rewards/code_reward/std": 0.12864025123417377, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1011.25, "completions/max_terminated_length": 669.75, "completions/mean_length": 461.25, "completions/mean_terminated_length": 410.2276840209961, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.023618301385196588, "grad_norm": 1.1498675164259378, "kl": 0.2158203125, "learning_rate": 3.4314126707703895e-06, "loss": 0.0824, "num_tokens": 6919749.0, "reward": 0.2663097037002444, "reward_std": 0.21830029226839542, "rewards/code_reward/mean": 0.2663097037002444, "rewards/code_reward/std": 0.21830029599368572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 868.25, "completions/max_terminated_length": 621.75, "completions/mean_length": 385.84375, "completions/mean_terminated_length": 338.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.023730236462851547, "grad_norm": 1.220829227031836, "kl": 0.20703125, "learning_rate": 3.4175085134862128e-06, "loss": 0.1624, "num_tokens": 6948192.0, "reward": 0.27685857750475407, "reward_std": 0.24184924457222223, "rewards/code_reward/mean": 0.27685857750475407, "rewards/code_reward/std": 0.24184925481677055, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.25, "completions/max_terminated_length": 540.25, "completions/mean_length": 334.1875, "completions/mean_terminated_length": 334.1875, "completions/min_length": 196.25, "completions/min_terminated_length": 196.25, "epoch": 0.023842171540506506, "grad_norm": 1.0550764471568521, "kl": 0.224365234375, "learning_rate": 3.4035763488816953e-06, "loss": 0.1182, "num_tokens": 6973222.0, "reward": 0.5495182275772095, "reward_std": 0.17330202460289001, "rewards/code_reward/mean": 0.5495182275772095, "rewards/code_reward/std": 0.17330202646553516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 360.46875, "completions/mean_terminated_length": 360.46875, "completions/min_length": 144.75, "completions/min_terminated_length": 144.75, "epoch": 0.023954106618161466, "grad_norm": 1.0108308143941853, "kl": 0.258544921875, "learning_rate": 3.3896167615225594e-06, "loss": 0.0636, "num_tokens": 6998765.0, "reward": 0.13169488031417131, "reward_std": 0.07389534078538418, "rewards/code_reward/mean": 0.13169488031417131, "rewards/code_reward/std": 0.07389534404501319, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1026.5, "completions/max_terminated_length": 619.5, "completions/mean_length": 401.0, "completions/mean_terminated_length": 346.1383972167969, "completions/min_length": 137.25, "completions/min_terminated_length": 137.25, "epoch": 0.024066041695816425, "grad_norm": 0.750694546999131, "kl": 0.19580078125, "learning_rate": 3.375630337125133e-06, "loss": 0.1223, "num_tokens": 7028501.0, "reward": 0.07791783940047026, "reward_std": 0.08552672585938126, "rewards/code_reward/mean": 0.07791783940047026, "rewards/code_reward/std": 0.0855267186416313, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 329.9375, "completions/mean_terminated_length": 329.9375, "completions/min_length": 163.75, "completions/min_terminated_length": 163.75, "epoch": 0.024177976773471388, "grad_norm": 0.9861460225219651, "kl": 0.20947265625, "learning_rate": 3.361617662531772e-06, "loss": 0.0307, "num_tokens": 7059667.0, "reward": 0.28345959074795246, "reward_std": 0.10491538979113102, "rewards/code_reward/mean": 0.28345959074795246, "rewards/code_reward/std": 0.10491538792848587, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.5, "completions/max_terminated_length": 487.5, "completions/mean_length": 300.9375, "completions/mean_terminated_length": 300.9375, "completions/min_length": 150.5, "completions/min_terminated_length": 150.5, "epoch": 0.024289911851126347, "grad_norm": 1.2867180675052194, "kl": 0.19677734375, "learning_rate": 3.347579325686237e-06, "loss": 0.0498, "num_tokens": 7084721.0, "reward": 0.38907771836966276, "reward_std": 0.32206146977841854, "rewards/code_reward/mean": 0.38907771836966276, "rewards/code_reward/std": 0.3220614865422249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.75, "completions/max_terminated_length": 508.75, "completions/mean_length": 332.0625, "completions/mean_terminated_length": 332.0625, "completions/min_length": 201.25, "completions/min_terminated_length": 201.25, "epoch": 0.024401846928781307, "grad_norm": 1.221746654434671, "kl": 0.192626953125, "learning_rate": 3.333515915609027e-06, "loss": -0.0326, "num_tokens": 7112387.0, "reward": 0.05860341805964708, "reward_std": 0.07969626039266586, "rewards/code_reward/mean": 0.05860341805964708, "rewards/code_reward/std": 0.07969625853002071, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.25, "completions/max_terminated_length": 567.25, "completions/mean_length": 317.25, "completions/mean_terminated_length": 317.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.024513782006436266, "grad_norm": 1.0695893302551942, "kl": 0.23388671875, "learning_rate": 3.3194280223726616e-06, "loss": 0.027, "num_tokens": 7138323.0, "reward": 0.14415738731622696, "reward_std": 0.14080366492271423, "rewards/code_reward/mean": 0.14415738731622696, "rewards/code_reward/std": 0.14080367609858513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.5, "completions/max_terminated_length": 387.5, "completions/mean_length": 239.6875, "completions/mean_terminated_length": 239.6875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.024625717084091225, "grad_norm": 1.763480550756643, "kl": 0.20849609375, "learning_rate": 3.305316237076927e-06, "loss": -0.1439, "num_tokens": 7159529.0, "reward": 0.10015321767423302, "reward_std": 0.17220470518805087, "rewards/code_reward/mean": 0.10015321767423302, "rewards/code_reward/std": 0.17220470635220408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.5, "completions/max_terminated_length": 487.5, "completions/mean_length": 327.15625, "completions/mean_terminated_length": 327.15625, "completions/min_length": 177.5, "completions/min_terminated_length": 177.5, "epoch": 0.024737652161746188, "grad_norm": 0.8169319139286209, "kl": 0.1607666015625, "learning_rate": 3.291181151824071e-06, "loss": 0.0895, "num_tokens": 7191342.0, "reward": 0.1822916641831398, "reward_std": 0.2553221881389618, "rewards/code_reward/mean": 0.1822916641831398, "rewards/code_reward/std": 0.2553221881389618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.25, "completions/max_terminated_length": 436.25, "completions/mean_length": 269.84375, "completions/mean_terminated_length": 269.84375, "completions/min_length": 153.25, "completions/min_terminated_length": 153.25, "epoch": 0.024849587239401148, "grad_norm": 1.1996415284846553, "kl": 0.2158203125, "learning_rate": 3.27702335969396e-06, "loss": -0.0201, "num_tokens": 7216737.0, "reward": 0.10358373820781708, "reward_std": 0.1254219285910949, "rewards/code_reward/mean": 0.10358373820781708, "rewards/code_reward/std": 0.1254219323163852, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.5, "completions/max_terminated_length": 349.5, "completions/mean_length": 217.03125, "completions/mean_terminated_length": 217.03125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.024961522317056107, "grad_norm": 1.2608189223620252, "kl": 0.2314453125, "learning_rate": 3.2628434547191985e-06, "loss": 0.0994, "num_tokens": 7235498.0, "reward": 0.14160977257415652, "reward_std": 0.0918192695826292, "rewards/code_reward/mean": 0.14160977257415652, "rewards/code_reward/std": 0.0918192733079195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 223.5625, "completions/mean_terminated_length": 223.5625, "completions/min_length": 149.25, "completions/min_terminated_length": 149.25, "epoch": 0.025073457394711066, "grad_norm": 1.600431868786668, "kl": 0.225341796875, "learning_rate": 3.2486420318601973e-06, "loss": 0.0364, "num_tokens": 7262236.0, "reward": 0.27536666474770755, "reward_std": 0.14216232020407915, "rewards/code_reward/mean": 0.27536666474770755, "rewards/code_reward/std": 0.14216232066974044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.75, "completions/max_terminated_length": 559.75, "completions/mean_length": 338.53125, "completions/mean_terminated_length": 338.53125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.02518539247236603, "grad_norm": 0.8719466687856433, "kl": 0.18359375, "learning_rate": 3.2344196869802187e-06, "loss": 0.0315, "num_tokens": 7298189.0, "reward": 0.015560166910290718, "reward_std": 0.012789241969585419, "rewards/code_reward/mean": 0.015560166910290718, "rewards/code_reward/std": 0.012789241969585419, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.25, "completions/max_terminated_length": 465.25, "completions/mean_length": 276.65625, "completions/mean_terminated_length": 276.65625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.02529732755002099, "grad_norm": 1.0186655813271028, "kl": 0.19287109375, "learning_rate": 3.2201770168203694e-06, "loss": 0.115, "num_tokens": 7334746.0, "reward": 0.10499188816174865, "reward_std": 0.10288760857656598, "rewards/code_reward/mean": 0.10499188816174865, "rewards/code_reward/std": 0.10288760880939662, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.75, "completions/max_terminated_length": 314.75, "completions/mean_length": 205.90625, "completions/mean_terminated_length": 205.90625, "completions/min_length": 130.75, "completions/min_terminated_length": 130.75, "epoch": 0.025409262627675948, "grad_norm": 0.7605637028441592, "kl": 0.16796875, "learning_rate": 3.205914618974563e-06, "loss": 0.0079, "num_tokens": 7353919.0, "reward": 0.01245777029544115, "reward_std": 0.01250904705375433, "rewards/code_reward/mean": 0.01245777029544115, "rewards/code_reward/std": 0.01250904705375433, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 191.46875, "completions/mean_terminated_length": 191.46875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.025521197705330907, "grad_norm": 1.2858515605192504, "kl": 0.259765625, "learning_rate": 3.1916330918644496e-06, "loss": 0.0768, "num_tokens": 7377150.0, "reward": 0.12344044167548418, "reward_std": 0.12209718860685825, "rewards/code_reward/mean": 0.12344044167548418, "rewards/code_reward/std": 0.12209718953818083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 226.6875, "completions/mean_terminated_length": 226.6875, "completions/min_length": 149.75, "completions/min_terminated_length": 149.75, "epoch": 0.025633132782985867, "grad_norm": 0.9529571983949628, "kl": 0.20458984375, "learning_rate": 3.177333034714303e-06, "loss": -0.0135, "num_tokens": 7402444.0, "reward": 0.06041666865348816, "reward_std": 0.038540102541446686, "rewards/code_reward/mean": 0.06041666865348816, "rewards/code_reward/std": 0.038540102541446686, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.75, "completions/max_terminated_length": 312.75, "completions/mean_length": 147.1875, "completions/mean_terminated_length": 147.1875, "completions/min_length": 67.25, "completions/min_terminated_length": 67.25, "epoch": 0.02574506786064083, "grad_norm": 1.403338545743401, "kl": 0.30029296875, "learning_rate": 3.1630150475258813e-06, "loss": 0.0342, "num_tokens": 7429962.0, "reward": 0.0703125, "reward_std": 0.11608850955963135, "rewards/code_reward/mean": 0.0703125, "rewards/code_reward/std": 0.1160885114222765, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 183.46875, "completions/mean_terminated_length": 183.46875, "completions/min_length": 92.75, "completions/min_terminated_length": 92.75, "epoch": 0.02585700293829579, "grad_norm": 1.2487410703105084, "kl": 0.259765625, "learning_rate": 3.148679731053252e-06, "loss": -0.0378, "num_tokens": 7455169.0, "reward": 0.21321137621998787, "reward_std": 0.2805868834257126, "rewards/code_reward/mean": 0.21321137621998787, "rewards/code_reward/std": 0.2805868834257126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.25, "completions/max_terminated_length": 303.25, "completions/mean_length": 153.84375, "completions/mean_terminated_length": 153.84375, "completions/min_length": 89.75, "completions/min_terminated_length": 89.75, "epoch": 0.025968938015950748, "grad_norm": 1.0579237191929745, "kl": 0.232421875, "learning_rate": 3.1343276867775805e-06, "loss": 0.0811, "num_tokens": 7480004.0, "reward": 0.1274509804788977, "reward_std": 0.2177756354212761, "rewards/code_reward/mean": 0.1274509804788977, "rewards/code_reward/std": 0.2177756503224373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.75, "completions/max_terminated_length": 246.75, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 100.5, "completions/min_terminated_length": 100.5, "epoch": 0.026080873093605707, "grad_norm": 1.373681707551141, "kl": 0.2509765625, "learning_rate": 3.1199595168819043e-06, "loss": 0.005, "num_tokens": 7508034.0, "reward": 0.2599347122013569, "reward_std": 0.22181765362620354, "rewards/code_reward/mean": 0.2599347122013569, "rewards/code_reward/std": 0.22181766107678413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.25, "completions/max_terminated_length": 369.25, "completions/mean_length": 237.15625, "completions/mean_terminated_length": 237.15625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.02619280817126067, "grad_norm": 1.449039323532629, "kl": 0.205322265625, "learning_rate": 3.105575824225852e-06, "loss": 0.0024, "num_tokens": 7536911.0, "reward": 0.17171062319539487, "reward_std": 0.1438203388825059, "rewards/code_reward/mean": 0.17171062319539487, "rewards/code_reward/std": 0.1438203463330865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.75, "completions/max_terminated_length": 345.75, "completions/mean_length": 170.21875, "completions/mean_terminated_length": 170.21875, "completions/min_length": 95.75, "completions/min_terminated_length": 95.75, "epoch": 0.02630474324891563, "grad_norm": 1.0698079544355648, "kl": 0.1624755859375, "learning_rate": 3.091177212320363e-06, "loss": -0.1894, "num_tokens": 7554142.0, "reward": 0.3125, "reward_std": 0.1462521031498909, "rewards/code_reward/mean": 0.3125, "rewards/code_reward/std": 0.1462521031498909, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 175.46875, "completions/mean_terminated_length": 175.46875, "completions/min_length": 112.75, "completions/min_terminated_length": 112.75, "epoch": 0.02641667832657059, "grad_norm": 1.4540255844594339, "kl": 0.2158203125, "learning_rate": 3.0767642853023538e-06, "loss": -0.0223, "num_tokens": 7584357.0, "reward": 0.21975820884108543, "reward_std": 0.1363154649734497, "rewards/code_reward/mean": 0.21975820884108543, "rewards/code_reward/std": 0.1363154649734497, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.25, "completions/max_terminated_length": 228.25, "completions/mean_length": 116.59375, "completions/mean_terminated_length": 116.59375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.02652861340422555, "grad_norm": 1.9465041168415773, "kl": 0.247802734375, "learning_rate": 3.062337647909376e-06, "loss": -0.039, "num_tokens": 7602040.0, "reward": 0.4692905358970165, "reward_std": 0.24660581722855568, "rewards/code_reward/mean": 0.4692905358970165, "rewards/code_reward/std": 0.24660583958029747, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.75, "completions/max_terminated_length": 583.75, "completions/mean_length": 281.21875, "completions/mean_terminated_length": 281.21875, "completions/min_length": 133.5, "completions/min_terminated_length": 133.5, "epoch": 0.026640548481880508, "grad_norm": 1.333783365108213, "kl": 0.1785888671875, "learning_rate": 3.04789790545424e-06, "loss": 0.0396, "num_tokens": 7627319.0, "reward": 0.17131002363748848, "reward_std": 0.18950149056036025, "rewards/code_reward/mean": 0.17131002363748848, "rewards/code_reward/std": 0.18950149248121306, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.5, "completions/max_terminated_length": 231.5, "completions/mean_length": 128.28125, "completions/mean_terminated_length": 128.28125, "completions/min_length": 69.25, "completions/min_terminated_length": 69.25, "epoch": 0.02675248355953547, "grad_norm": 0.8510436652715602, "kl": 0.250732421875, "learning_rate": 3.033445663799621e-06, "loss": -0.0327, "num_tokens": 7644360.0, "reward": 0.07549504935741425, "reward_std": 0.07982433587312698, "rewards/code_reward/mean": 0.07549504935741425, "rewards/code_reward/std": 0.07982433587312698, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.25, "completions/max_terminated_length": 198.25, "completions/mean_length": 118.28125, "completions/mean_terminated_length": 118.28125, "completions/min_length": 79.5, "completions/min_terminated_length": 79.5, "epoch": 0.02686441863719043, "grad_norm": 2.420152417522015, "kl": 0.25146484375, "learning_rate": 3.018981529332633e-06, "loss": 0.0544, "num_tokens": 7661793.0, "reward": 0.07239184161880985, "reward_std": 0.05399157607462257, "rewards/code_reward/mean": 0.07239184161880985, "rewards/code_reward/std": 0.05399157712236047, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.02697635371484539, "grad_norm": 1.3615781255434425, "kl": 0.1885986328125, "learning_rate": 3.00450610893939e-06, "loss": 0.0817, "num_tokens": 7685573.0, "reward": 0.2050044471397996, "reward_std": 0.1304325871169567, "rewards/code_reward/mean": 0.2050044471397996, "rewards/code_reward/std": 0.13043258781544864, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.75, "completions/max_terminated_length": 230.75, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "epoch": 0.02708828879250035, "grad_norm": 1.4458971248752004, "kl": 0.3115234375, "learning_rate": 2.9900200099795396e-06, "loss": 0.1362, "num_tokens": 7711785.0, "reward": 0.12025879789143801, "reward_std": 0.10159321606624871, "rewards/code_reward/mean": 0.12025879789143801, "rewards/code_reward/std": 0.10159321606624871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.5, "completions/max_terminated_length": 615.5, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 117.75, "completions/min_terminated_length": 117.75, "epoch": 0.02720022387015531, "grad_norm": 0.9925754447279824, "kl": 0.1759033203125, "learning_rate": 2.9755238402607826e-06, "loss": 0.0145, "num_tokens": 7736909.0, "reward": 0.2357253096997738, "reward_std": 0.11498994007706642, "rewards/code_reward/mean": 0.2357253096997738, "rewards/code_reward/std": 0.11498994193971157, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.25, "completions/max_terminated_length": 201.25, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 68.75, "completions/min_terminated_length": 68.75, "epoch": 0.02731215894781027, "grad_norm": 1.7694959761730793, "kl": 0.1773681640625, "learning_rate": 2.961018208013367e-06, "loss": 0.0806, "num_tokens": 7753785.0, "reward": 0.26032672822475433, "reward_std": 0.2158903395757079, "rewards/code_reward/mean": 0.26032672822475433, "rewards/code_reward/std": 0.21589034423232079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.5, "completions/max_terminated_length": 259.5, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 77.75, "completions/min_terminated_length": 77.75, "epoch": 0.02742409402546523, "grad_norm": 1.5239142070937466, "kl": 0.1982421875, "learning_rate": 2.9465037218645694e-06, "loss": 0.0341, "num_tokens": 7770921.0, "reward": 0.15353127755224705, "reward_std": 0.1622099713422358, "rewards/code_reward/mean": 0.15353127755224705, "rewards/code_reward/std": 0.16220997110940516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.25, "completions/max_terminated_length": 193.25, "completions/mean_length": 129.40625, "completions/mean_terminated_length": 129.40625, "completions/min_length": 83.75, "completions/min_terminated_length": 83.75, "epoch": 0.02753602910312019, "grad_norm": 1.4700135521247835, "kl": 0.311767578125, "learning_rate": 2.9319809908131604e-06, "loss": -0.0235, "num_tokens": 7793438.0, "reward": 0.22987624257802963, "reward_std": 0.19782325625419617, "rewards/code_reward/mean": 0.22987624257802963, "rewards/code_reward/std": 0.19782325625419617, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.25, "completions/max_terminated_length": 221.25, "completions/mean_length": 132.5625, "completions/mean_terminated_length": 132.5625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.02764796418077515, "grad_norm": 0.7938111312829735, "kl": 0.2685546875, "learning_rate": 2.917450624203847e-06, "loss": 0.0108, "num_tokens": 7811344.0, "reward": 0.1285112500190735, "reward_std": 0.03530046343803406, "rewards/code_reward/mean": 0.1285112500190735, "rewards/code_reward/std": 0.03530046343803406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 72.75, "completions/min_terminated_length": 72.75, "epoch": 0.02775989925843011, "grad_norm": 1.461851676320045, "kl": 0.321533203125, "learning_rate": 2.9029132317017118e-06, "loss": 0.0822, "num_tokens": 7836400.0, "reward": 0.07058638549642637, "reward_std": 0.09412376256659627, "rewards/code_reward/mean": 0.07058638549642637, "rewards/code_reward/std": 0.09412377001717687, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.25, "completions/max_terminated_length": 184.25, "completions/mean_length": 131.59375, "completions/mean_terminated_length": 131.59375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.02787183433608507, "grad_norm": 1.6504030899968662, "kl": 0.275634765625, "learning_rate": 2.888369423266629e-06, "loss": 0.0701, "num_tokens": 7857059.0, "reward": 0.059203914599493146, "reward_std": 0.09994567523244768, "rewards/code_reward/mean": 0.059203914599493146, "rewards/code_reward/std": 0.09994568361435086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.5, "completions/max_terminated_length": 415.5, "completions/mean_length": 206.15625, "completions/mean_terminated_length": 206.15625, "completions/min_length": 104.25, "completions/min_terminated_length": 104.25, "epoch": 0.02798376941374003, "grad_norm": 1.159655895359276, "kl": 0.165283203125, "learning_rate": 2.8738198091276712e-06, "loss": -0.0308, "num_tokens": 7882080.0, "reward": 0.10000000149011612, "reward_std": 0.1331607922911644, "rewards/code_reward/mean": 0.10000000149011612, "rewards/code_reward/std": 0.1331607922911644, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.25, "completions/max_terminated_length": 266.25, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.02809570449139499, "grad_norm": 1.3860909228149765, "kl": 0.3243408203125, "learning_rate": 2.859264999757509e-06, "loss": -0.0087, "num_tokens": 7904552.0, "reward": 0.17192643135786057, "reward_std": 0.12011632975190878, "rewards/code_reward/mean": 0.17192643135786057, "rewards/code_reward/std": 0.12011633953079581, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 111.34375, "completions/mean_terminated_length": 111.34375, "completions/min_length": 78.25, "completions/min_terminated_length": 78.25, "epoch": 0.028207639569049953, "grad_norm": 2.5730420152805027, "kl": 0.35986328125, "learning_rate": 2.8447056058467928e-06, "loss": -0.0585, "num_tokens": 7929307.0, "reward": 0.17651335208211094, "reward_std": 0.22341035841964185, "rewards/code_reward/mean": 0.17651335208211094, "rewards/code_reward/std": 0.22341035841964185, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.5, "completions/max_terminated_length": 225.5, "completions/mean_length": 145.40625, "completions/mean_terminated_length": 145.40625, "completions/min_length": 86.25, "completions/min_terminated_length": 86.25, "epoch": 0.028319574646704912, "grad_norm": 1.8094443062077088, "kl": 0.36767578125, "learning_rate": 2.830142238278531e-06, "loss": 0.0709, "num_tokens": 7952504.0, "reward": 0.2017338698497042, "reward_std": 0.20673675020225346, "rewards/code_reward/mean": 0.2017338698497042, "rewards/code_reward/std": 0.20673674996942282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.25, "completions/max_terminated_length": 203.25, "completions/mean_length": 119.84375, "completions/mean_terminated_length": 119.84375, "completions/min_length": 75.75, "completions/min_terminated_length": 75.75, "epoch": 0.02843150972435987, "grad_norm": 2.3402410563756315, "kl": 0.354736328125, "learning_rate": 2.81557550810246e-06, "loss": -0.0806, "num_tokens": 7976539.0, "reward": 0.3907702271826565, "reward_std": 0.26296099089086056, "rewards/code_reward/mean": 0.3907702271826565, "rewards/code_reward/std": 0.26296099927276373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 128.8125, "completions/mean_terminated_length": 128.8125, "completions/min_length": 72.5, "completions/min_terminated_length": 72.5, "epoch": 0.02854344480201483, "grad_norm": 1.6470365991478257, "kl": 0.3701171875, "learning_rate": 2.8010060265094026e-06, "loss": 0.0623, "num_tokens": 7998165.0, "reward": 0.14626706298440695, "reward_std": 0.13755429768934846, "rewards/code_reward/mean": 0.14626706298440695, "rewards/code_reward/std": 0.13755429675802588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 67.5, "completions/min_terminated_length": 67.5, "epoch": 0.02865537987966979, "grad_norm": 1.5339855216333798, "kl": 0.224365234375, "learning_rate": 2.786434404805629e-06, "loss": 0.0387, "num_tokens": 8031691.0, "reward": 0.09155143890529871, "reward_std": 0.09001913899555802, "rewards/code_reward/mean": 0.09155143890529871, "rewards/code_reward/std": 0.09001914283726364, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 145.6875, "completions/mean_terminated_length": 145.6875, "completions/min_length": 88.75, "completions/min_terminated_length": 88.75, "epoch": 0.028767314957324753, "grad_norm": 1.8276516553873643, "kl": 0.37451171875, "learning_rate": 2.771861254387199e-06, "loss": -0.0261, "num_tokens": 8052065.0, "reward": 0.21322817541658878, "reward_std": 0.16002243757247925, "rewards/code_reward/mean": 0.21322817541658878, "rewards/code_reward/std": 0.16002243757247925, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.75, "completions/max_terminated_length": 243.75, "completions/mean_length": 164.90625, "completions/mean_terminated_length": 164.90625, "completions/min_length": 109.25, "completions/min_terminated_length": 109.25, "epoch": 0.028879250034979712, "grad_norm": 1.559190798859591, "kl": 0.28076171875, "learning_rate": 2.7572871867143204e-06, "loss": 0.0077, "num_tokens": 8082230.0, "reward": 0.19782285764813423, "reward_std": 0.23767431639134884, "rewards/code_reward/mean": 0.19782285764813423, "rewards/code_reward/std": 0.23767432384192944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.5, "completions/max_terminated_length": 329.5, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 83.5, "completions/min_terminated_length": 83.5, "epoch": 0.02899118511263467, "grad_norm": 0.9033316433119087, "kl": 0.302978515625, "learning_rate": 2.742712813285681e-06, "loss": 0.0697, "num_tokens": 8106786.0, "reward": 0.0914294570684433, "reward_std": 0.09275190159678459, "rewards/code_reward/mean": 0.0914294570684433, "rewards/code_reward/std": 0.09275190159678459, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 123.71875, "completions/mean_terminated_length": 123.71875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.02910312019028963, "grad_norm": 1.9152198651462768, "kl": 0.333984375, "learning_rate": 2.7281387456128017e-06, "loss": 0.014, "num_tokens": 8126217.0, "reward": 0.21696891635656357, "reward_std": 0.28822916746139526, "rewards/code_reward/mean": 0.21696891635656357, "rewards/code_reward/std": 0.28822918236255646, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.75, "completions/max_terminated_length": 369.75, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 73.25, "completions/min_terminated_length": 73.25, "epoch": 0.029215055267944594, "grad_norm": 0.9135668163744887, "kl": 0.24755859375, "learning_rate": 2.7135655951943716e-06, "loss": -0.0166, "num_tokens": 8150855.0, "reward": 0.03386699501425028, "reward_std": 0.06380424555391073, "rewards/code_reward/mean": 0.03386699501425028, "rewards/code_reward/std": 0.06380424555391073, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.75, "completions/max_terminated_length": 271.75, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.029326990345599553, "grad_norm": 1.6076094252791506, "kl": 0.31201171875, "learning_rate": 2.698993973490598e-06, "loss": 0.0945, "num_tokens": 8165387.0, "reward": 0.20722341747023165, "reward_std": 0.11584698176011443, "rewards/code_reward/mean": 0.20722341747023165, "rewards/code_reward/std": 0.11584698967635632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.5, "completions/max_terminated_length": 260.5, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.029438925423254513, "grad_norm": 1.2712990086273, "kl": 0.266357421875, "learning_rate": 2.6844244918975416e-06, "loss": 0.0136, "num_tokens": 8185445.0, "reward": 0.12302206363528967, "reward_std": 0.1178859043866396, "rewards/code_reward/mean": 0.12302206363528967, "rewards/code_reward/std": 0.11788590624928474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.75, "completions/max_terminated_length": 276.75, "completions/mean_length": 177.96875, "completions/mean_terminated_length": 177.96875, "completions/min_length": 109.25, "completions/min_terminated_length": 109.25, "epoch": 0.029550860500909472, "grad_norm": 1.5932680686482104, "kl": 0.27490234375, "learning_rate": 2.66985776172147e-06, "loss": -0.0647, "num_tokens": 8214836.0, "reward": 0.3632364124059677, "reward_std": 0.24340662360191345, "rewards/code_reward/mean": 0.3632364124059677, "rewards/code_reward/std": 0.24340663105249405, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 131.03125, "completions/mean_terminated_length": 131.03125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.02966279557856443, "grad_norm": 1.3583412717133385, "kl": 0.3232421875, "learning_rate": 2.6552943941532088e-06, "loss": 0.0688, "num_tokens": 8233101.0, "reward": 0.19121321476995945, "reward_std": 0.1444133589975536, "rewards/code_reward/mean": 0.19121321476995945, "rewards/code_reward/std": 0.14441336272284389, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.25, "completions/max_terminated_length": 195.25, "completions/mean_length": 131.9375, "completions/mean_terminated_length": 131.9375, "completions/min_length": 98.75, "completions/min_terminated_length": 98.75, "epoch": 0.029774730656219394, "grad_norm": 1.2376740863800209, "kl": 0.347900390625, "learning_rate": 2.6407350002424927e-06, "loss": -0.0064, "num_tokens": 8253363.0, "reward": 0.24439102411270142, "reward_std": 0.17834187299013138, "rewards/code_reward/mean": 0.24439102411270142, "rewards/code_reward/std": 0.17834188044071198, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.5, "completions/max_terminated_length": 177.5, "completions/mean_length": 106.03125, "completions/mean_terminated_length": 106.03125, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.029886665733874353, "grad_norm": 1.4198187332614487, "kl": 0.30126953125, "learning_rate": 2.626180190872329e-06, "loss": -0.047, "num_tokens": 8267084.0, "reward": 0.04570374824106693, "reward_std": 0.034461796283721924, "rewards/code_reward/mean": 0.04570374824106693, "rewards/code_reward/std": 0.03446180047467351, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.029998600811529313, "grad_norm": 2.766302572356958, "kl": 0.254150390625, "learning_rate": 2.611630576733372e-06, "loss": 0.0719, "num_tokens": 8285988.0, "reward": 0.20828989439178258, "reward_std": 0.1855767808156088, "rewards/code_reward/mean": 0.20828989439178258, "rewards/code_reward/std": 0.18557679950026795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 205.78125, "completions/mean_terminated_length": 205.78125, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.030110535889184272, "grad_norm": 0.559152868059546, "kl": 0.216064453125, "learning_rate": 2.5970867682982885e-06, "loss": 0.0113, "num_tokens": 8315381.0, "reward": 0.01448170654475689, "reward_std": 0.03370444104075432, "rewards/code_reward/mean": 0.01448170654475689, "rewards/code_reward/std": 0.03370444104075432, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 88.75, "completions/min_terminated_length": 88.75, "epoch": 0.030222470966839235, "grad_norm": 1.7716322097224022, "kl": 0.329833984375, "learning_rate": 2.582549375796154e-06, "loss": 0.1976, "num_tokens": 8336289.0, "reward": 0.1647916678339243, "reward_std": 0.1911229882389307, "rewards/code_reward/mean": 0.1647916678339243, "rewards/code_reward/std": 0.1911229882389307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.25, "completions/max_terminated_length": 275.25, "completions/mean_length": 184.71875, "completions/mean_terminated_length": 184.71875, "completions/min_length": 112.25, "completions/min_terminated_length": 112.25, "epoch": 0.030334406044494194, "grad_norm": 0.9190427217910049, "kl": 0.28369140625, "learning_rate": 2.568019009186841e-06, "loss": -0.014, "num_tokens": 8358944.0, "reward": 0.20673798964708112, "reward_std": 0.11309454750153236, "rewards/code_reward/mean": 0.20673798964708112, "rewards/code_reward/std": 0.11309454750153236, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.5, "completions/max_terminated_length": 260.5, "completions/mean_length": 145.5625, "completions/mean_terminated_length": 145.5625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.030446341122149154, "grad_norm": 1.575353161865142, "kl": 0.3779296875, "learning_rate": 2.5534962781354317e-06, "loss": 0.1436, "num_tokens": 8380378.0, "reward": 0.240084670484066, "reward_std": 0.27030207961797714, "rewards/code_reward/mean": 0.240084670484066, "rewards/code_reward/std": 0.27030208706855774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.25, "completions/max_terminated_length": 411.25, "completions/mean_length": 185.34375, "completions/mean_terminated_length": 185.34375, "completions/min_length": 99.25, "completions/min_terminated_length": 99.25, "epoch": 0.030558276199804113, "grad_norm": 1.529799496784454, "kl": 0.27734375, "learning_rate": 2.538981791986634e-06, "loss": -0.072, "num_tokens": 8410077.0, "reward": 0.36352282762527466, "reward_std": 0.24801481142640114, "rewards/code_reward/mean": 0.36352282762527466, "rewards/code_reward/std": 0.24801481887698174, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.5, "completions/max_terminated_length": 322.5, "completions/mean_length": 156.71875, "completions/mean_terminated_length": 156.71875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.030670211277459072, "grad_norm": 1.670315398590079, "kl": 0.28173828125, "learning_rate": 2.524476159739218e-06, "loss": -0.0316, "num_tokens": 8433564.0, "reward": 0.08395027136430144, "reward_std": 0.10782372578978539, "rewards/code_reward/mean": 0.08395027136430144, "rewards/code_reward/std": 0.10782372625544667, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 130.09375, "completions/mean_terminated_length": 130.09375, "completions/min_length": 76.75, "completions/min_terminated_length": 76.75, "epoch": 0.030782146355114035, "grad_norm": 1.8106311365726324, "kl": 0.3251953125, "learning_rate": 2.5099799900204607e-06, "loss": 0.0782, "num_tokens": 8452687.0, "reward": 0.32567203789949417, "reward_std": 0.27224994264543056, "rewards/code_reward/mean": 0.32567203789949417, "rewards/code_reward/std": 0.27224994637072086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 657.75, "completions/max_terminated_length": 223.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 111.20089340209961, "completions/min_length": 57.25, "completions/min_terminated_length": 57.25, "epoch": 0.030894081432768995, "grad_norm": 1.8321308568390302, "kl": 0.321533203125, "learning_rate": 2.4954938910606108e-06, "loss": 0.1624, "num_tokens": 8475671.0, "reward": 0.14915229193866253, "reward_std": 0.11303082318045199, "rewards/code_reward/mean": 0.14915229193866253, "rewards/code_reward/std": 0.1130308248102665, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.25, "completions/max_terminated_length": 378.25, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.031006016510423954, "grad_norm": 1.5281779015369072, "kl": 0.2308349609375, "learning_rate": 2.481018470667368e-06, "loss": 0.1693, "num_tokens": 8502299.0, "reward": 0.18173168785870075, "reward_std": 0.10828323196619749, "rewards/code_reward/mean": 0.18173168785870075, "rewards/code_reward/std": 0.10828323615714908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.5, "completions/max_terminated_length": 260.5, "completions/mean_length": 150.34375, "completions/mean_terminated_length": 150.34375, "completions/min_length": 71.75, "completions/min_terminated_length": 71.75, "epoch": 0.031117951588078913, "grad_norm": 1.6397888672992769, "kl": 0.310546875, "learning_rate": 2.4665543362003802e-06, "loss": 0.0215, "num_tokens": 8528406.0, "reward": 0.12074580090120435, "reward_std": 0.17130711488425732, "rewards/code_reward/mean": 0.12074580090120435, "rewards/code_reward/std": 0.17130712047219276, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.5, "completions/max_terminated_length": 241.5, "completions/mean_length": 120.4375, "completions/mean_terminated_length": 120.4375, "completions/min_length": 69.75, "completions/min_terminated_length": 69.75, "epoch": 0.031229886665733873, "grad_norm": 1.8254531380803452, "kl": 0.32275390625, "learning_rate": 2.4521020945457615e-06, "loss": 0.0678, "num_tokens": 8549612.0, "reward": 0.6036184206604958, "reward_std": 0.3284572381526232, "rewards/code_reward/mean": 0.6036184206604958, "rewards/code_reward/std": 0.3284572381526232, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 126.84375, "completions/mean_terminated_length": 126.84375, "completions/min_length": 56.25, "completions/min_terminated_length": 56.25, "epoch": 0.031341821743388835, "grad_norm": 2.4668184273965648, "kl": 0.4033203125, "learning_rate": 2.4376623520906255e-06, "loss": 0.1532, "num_tokens": 8569279.0, "reward": 0.18534822203218937, "reward_std": 0.146333621814847, "rewards/code_reward/mean": 0.18534822203218937, "rewards/code_reward/std": 0.1463336320593953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 150.90625, "completions/mean_terminated_length": 150.90625, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.031453756821043795, "grad_norm": 1.2334442728026884, "kl": 0.3759765625, "learning_rate": 2.4232357146976478e-06, "loss": 0.0028, "num_tokens": 8597444.0, "reward": 0.1862155646085739, "reward_std": 0.10262486711144447, "rewards/code_reward/mean": 0.1862155646085739, "rewards/code_reward/std": 0.10262487456202507, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.5, "completions/max_terminated_length": 510.5, "completions/mean_length": 210.78125, "completions/mean_terminated_length": 210.78125, "completions/min_length": 97.25, "completions/min_terminated_length": 97.25, "epoch": 0.031565691898698754, "grad_norm": 1.484546879319261, "kl": 0.309326171875, "learning_rate": 2.408822787679637e-06, "loss": -0.0366, "num_tokens": 8622829.0, "reward": 0.1472295392304659, "reward_std": 0.10856602992862463, "rewards/code_reward/mean": 0.1472295392304659, "rewards/code_reward/std": 0.10856604157015681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.25, "completions/max_terminated_length": 266.25, "completions/mean_length": 177.84375, "completions/mean_terminated_length": 177.84375, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.031677626976353714, "grad_norm": 1.6429453484339698, "kl": 0.318359375, "learning_rate": 2.3944241757741475e-06, "loss": 0.0429, "num_tokens": 8643536.0, "reward": 0.1551339291036129, "reward_std": 0.21810386329889297, "rewards/code_reward/mean": 0.1551339291036129, "rewards/code_reward/std": 0.21810387633740902, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.5, "completions/max_terminated_length": 218.5, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 90.75, "completions/min_terminated_length": 90.75, "epoch": 0.03178956205400867, "grad_norm": 1.9278006659321785, "kl": 0.294677734375, "learning_rate": 2.380040483118097e-06, "loss": -0.0661, "num_tokens": 8660110.0, "reward": 0.1566466533113271, "reward_std": 0.15316736698150635, "rewards/code_reward/mean": 0.1566466533113271, "rewards/code_reward/std": 0.1531673688441515, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.75, "completions/max_terminated_length": 200.75, "completions/mean_length": 111.15625, "completions/mean_terminated_length": 111.15625, "completions/min_length": 67.25, "completions/min_terminated_length": 67.25, "epoch": 0.03190149713166363, "grad_norm": 1.924152560934373, "kl": 0.46728515625, "learning_rate": 2.365672313222419e-06, "loss": 0.0708, "num_tokens": 8676963.0, "reward": 0.3067304156720638, "reward_std": 0.17022380698472261, "rewards/code_reward/mean": 0.3067304156720638, "rewards/code_reward/std": 0.1702238107100129, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.5, "completions/max_terminated_length": 146.5, "completions/mean_length": 88.375, "completions/mean_terminated_length": 88.375, "completions/min_length": 53.5, "completions/min_terminated_length": 53.5, "epoch": 0.0320134322093186, "grad_norm": 2.52648816592765, "kl": 0.4990234375, "learning_rate": 2.351320268946749e-06, "loss": -0.0968, "num_tokens": 8696055.0, "reward": 0.21885720640420914, "reward_std": 0.18590925447642803, "rewards/code_reward/mean": 0.21885720640420914, "rewards/code_reward/std": 0.18590926192700863, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 949.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 296.96875, "completions/mean_terminated_length": 238.07143020629883, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.03212536728697356, "grad_norm": 1.4087649689505792, "kl": 0.279296875, "learning_rate": 2.336984952474119e-06, "loss": 0.1631, "num_tokens": 8732022.0, "reward": 0.12815122242318466, "reward_std": 0.13949624670203775, "rewards/code_reward/mean": 0.12815122242318466, "rewards/code_reward/std": 0.13949625426903367, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.5, "completions/max_terminated_length": 313.5, "completions/mean_length": 165.6875, "completions/mean_terminated_length": 165.6875, "completions/min_length": 89.75, "completions/min_terminated_length": 89.75, "epoch": 0.03223730236462852, "grad_norm": 1.5911200294302021, "kl": 0.309326171875, "learning_rate": 2.322666965285697e-06, "loss": -0.0499, "num_tokens": 8752596.0, "reward": 0.2135441319551319, "reward_std": 0.1789869824424386, "rewards/code_reward/mean": 0.2135441319551319, "rewards/code_reward/std": 0.17898700083605945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.25, "completions/max_terminated_length": 487.25, "completions/mean_length": 240.5625, "completions/mean_terminated_length": 240.5625, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.03234923744228348, "grad_norm": 1.6229086799739825, "kl": 0.305908203125, "learning_rate": 2.3083669081355507e-06, "loss": 0.1546, "num_tokens": 8783550.0, "reward": 0.060625465121120214, "reward_std": 0.031893965788185596, "rewards/code_reward/mean": 0.060625465121120214, "rewards/code_reward/std": 0.03189396392554045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 736.75, "completions/max_terminated_length": 277.75, "completions/mean_length": 226.15625, "completions/mean_terminated_length": 167.17411041259766, "completions/min_length": 81.25, "completions/min_terminated_length": 81.25, "epoch": 0.032461172519938436, "grad_norm": 0.9678300330589205, "kl": 0.138671875, "learning_rate": 2.2940853810254377e-06, "loss": 0.1927, "num_tokens": 8806243.0, "reward": 0.301976312417537, "reward_std": 0.05864762840792537, "rewards/code_reward/mean": 0.301976312417537, "rewards/code_reward/std": 0.058647628873586655, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.75, "completions/max_terminated_length": 419.75, "completions/mean_length": 161.28125, "completions/mean_terminated_length": 161.28125, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.032573107597593395, "grad_norm": 2.0533336308641092, "kl": 0.43505859375, "learning_rate": 2.2798229831796313e-06, "loss": 0.0806, "num_tokens": 8830396.0, "reward": 0.08084819512441754, "reward_std": 0.043647464364767075, "rewards/code_reward/mean": 0.08084819512441754, "rewards/code_reward/std": 0.043647464364767075, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 739.75, "completions/max_terminated_length": 282.0, "completions/mean_length": 217.4375, "completions/mean_terminated_length": 157.30357360839844, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.032685042675248355, "grad_norm": 1.637832465555356, "kl": 0.2890625, "learning_rate": 2.2655803130197816e-06, "loss": 0.2231, "num_tokens": 8853858.0, "reward": 0.2264392450451851, "reward_std": 0.21030585933476686, "rewards/code_reward/mean": 0.2264392450451851, "rewards/code_reward/std": 0.21030588168650866, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.5, "completions/max_terminated_length": 349.5, "completions/mean_length": 193.40625, "completions/mean_terminated_length": 193.40625, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.032796977752903314, "grad_norm": 1.574199000350919, "kl": 0.384765625, "learning_rate": 2.2513579681398034e-06, "loss": 0.0158, "num_tokens": 8878935.0, "reward": 0.2135722152888775, "reward_std": 0.17660537734627724, "rewards/code_reward/mean": 0.2135722152888775, "rewards/code_reward/std": 0.17660538339987397, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.25, "completions/max_terminated_length": 311.25, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 91.25, "completions/min_terminated_length": 91.25, "epoch": 0.03290891283055827, "grad_norm": 1.6072325452218685, "kl": 0.363037109375, "learning_rate": 2.237156545280803e-06, "loss": 0.0884, "num_tokens": 8901727.0, "reward": 0.3473220057785511, "reward_std": 0.18608891125768423, "rewards/code_reward/mean": 0.3473220057785511, "rewards/code_reward/std": 0.18608891125768423, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.75, "completions/max_terminated_length": 419.75, "completions/mean_length": 209.28125, "completions/mean_terminated_length": 209.28125, "completions/min_length": 105.75, "completions/min_terminated_length": 105.75, "epoch": 0.03302084790821324, "grad_norm": 1.6249151148932088, "kl": 0.248779296875, "learning_rate": 2.2229766403060403e-06, "loss": -0.0182, "num_tokens": 8925072.0, "reward": 0.307357229758054, "reward_std": 0.13686690758913755, "rewards/code_reward/mean": 0.307357229758054, "rewards/code_reward/std": 0.13686690386384726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 178.40625, "completions/mean_terminated_length": 178.40625, "completions/min_length": 90.25, "completions/min_terminated_length": 90.25, "epoch": 0.0331327829858682, "grad_norm": 1.6672544303984767, "kl": 0.32666015625, "learning_rate": 2.2088188481759305e-06, "loss": 0.0041, "num_tokens": 8942397.0, "reward": 0.17835952731547877, "reward_std": 0.14297430915758014, "rewards/code_reward/mean": 0.17835952731547877, "rewards/code_reward/std": 0.1429743110202253, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 168.09375, "completions/mean_terminated_length": 168.09375, "completions/min_length": 82.75, "completions/min_terminated_length": 82.75, "epoch": 0.03324471806352316, "grad_norm": 1.5915878451347125, "kl": 0.42919921875, "learning_rate": 2.194683762923073e-06, "loss": -0.0342, "num_tokens": 8967448.0, "reward": 0.23388671875, "reward_std": 0.09839868592098355, "rewards/code_reward/mean": 0.23388671875, "rewards/code_reward/std": 0.09839868592098355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1053.75, "completions/max_terminated_length": 626.25, "completions/mean_length": 331.40625, "completions/mean_terminated_length": 275.5669708251953, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "epoch": 0.03335665314117812, "grad_norm": 1.1259318778679914, "kl": 0.24853515625, "learning_rate": 2.1805719776273387e-06, "loss": 0.1031, "num_tokens": 8996029.0, "reward": 0.21752450801432133, "reward_std": 0.22587602585554123, "rewards/code_reward/mean": 0.21752450801432133, "rewards/code_reward/std": 0.22587604075670242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 810.25, "completions/max_terminated_length": 459.5, "completions/mean_length": 282.46875, "completions/mean_terminated_length": 228.1651840209961, "completions/min_length": 90.25, "completions/min_terminated_length": 90.25, "epoch": 0.03346858821883308, "grad_norm": 1.327314761495312, "kl": 0.2470703125, "learning_rate": 2.166484084390974e-06, "loss": -0.0158, "num_tokens": 9024660.0, "reward": 0.4248046875, "reward_std": 0.41644760966300964, "rewards/code_reward/mean": 0.4248046875, "rewards/code_reward/std": 0.41644763946533203, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 816.5, "completions/max_terminated_length": 452.75, "completions/mean_length": 257.34375, "completions/mean_terminated_length": 201.17857360839844, "completions/min_length": 88.5, "completions/min_terminated_length": 88.5, "epoch": 0.033580523296488037, "grad_norm": 1.2867387634028893, "kl": 0.251953125, "learning_rate": 2.1524206743137636e-06, "loss": -0.2782, "num_tokens": 9049823.0, "reward": 0.2559996712952852, "reward_std": 0.17017995577771217, "rewards/code_reward/mean": 0.2559996712952852, "rewards/code_reward/std": 0.1701799522852525, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.5, "completions/max_terminated_length": 544.5, "completions/mean_length": 296.03125, "completions/mean_terminated_length": 296.03125, "completions/min_length": 139.25, "completions/min_terminated_length": 139.25, "epoch": 0.033692458374142996, "grad_norm": 1.3498052592722694, "kl": 0.25732421875, "learning_rate": 2.1383823374682287e-06, "loss": 0.0851, "num_tokens": 9079328.0, "reward": 0.38671875, "reward_std": 0.3085732739418745, "rewards/code_reward/mean": 0.38671875, "rewards/code_reward/std": 0.3085732851177454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 205.40625, "completions/mean_terminated_length": 205.40625, "completions/min_length": 79.75, "completions/min_terminated_length": 79.75, "epoch": 0.033804393451797955, "grad_norm": 1.1786971151666847, "kl": 0.253662109375, "learning_rate": 2.124369662874868e-06, "loss": 0.0537, "num_tokens": 9103917.0, "reward": 0.1022359449416399, "reward_std": 0.1313032009638846, "rewards/code_reward/mean": 0.1022359449416399, "rewards/code_reward/std": 0.13130320748314261, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.5, "completions/max_terminated_length": 785.5, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 83.75, "completions/min_terminated_length": 83.75, "epoch": 0.033916328529452915, "grad_norm": 1.2753854412194898, "kl": 0.23779296875, "learning_rate": 2.110383238477441e-06, "loss": 0.1839, "num_tokens": 9131989.0, "reward": 0.3315134688746184, "reward_std": 0.21372198988683522, "rewards/code_reward/mean": 0.3315134688746184, "rewards/code_reward/std": 0.21372198243625462, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 834.75, "completions/max_terminated_length": 440.25, "completions/mean_length": 297.6875, "completions/mean_terminated_length": 241.50446701049805, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.03402826360710788, "grad_norm": 0.9686437804358383, "kl": 0.25732421875, "learning_rate": 2.096423651118305e-06, "loss": 0.0919, "num_tokens": 9155547.0, "reward": 0.18424479104578495, "reward_std": 0.2077017817646265, "rewards/code_reward/mean": 0.18424479104578495, "rewards/code_reward/std": 0.20770180504769087, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 807.0, "completions/max_terminated_length": 623.5, "completions/mean_length": 361.53125, "completions/mean_terminated_length": 315.49554443359375, "completions/min_length": 142.25, "completions/min_terminated_length": 142.25, "epoch": 0.03414019868476284, "grad_norm": 1.203308108515461, "kl": 0.24169921875, "learning_rate": 2.082491486513788e-06, "loss": 0.0801, "num_tokens": 9183796.0, "reward": 0.22409930732101202, "reward_std": 0.2232498861849308, "rewards/code_reward/mean": 0.22409930732101202, "rewards/code_reward/std": 0.22324990667402744, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.25, "completions/max_terminated_length": 560.25, "completions/mean_length": 283.53125, "completions/mean_terminated_length": 283.53125, "completions/min_length": 173.75, "completions/min_terminated_length": 173.75, "epoch": 0.0342521337624178, "grad_norm": 1.0443749007299015, "kl": 0.22998046875, "learning_rate": 2.0685873292296116e-06, "loss": -0.0535, "num_tokens": 9212077.0, "reward": 0.3671575216576457, "reward_std": 0.18197334744036198, "rewards/code_reward/mean": 0.3671575216576457, "rewards/code_reward/std": 0.18197335489094257, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 959.25, "completions/max_terminated_length": 559.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 283.05357360839844, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.03436406884007276, "grad_norm": 0.9280325752065158, "kl": 0.26708984375, "learning_rate": 2.054711762656369e-06, "loss": 0.0166, "num_tokens": 9245945.0, "reward": 0.20142045244574547, "reward_std": 0.1855016816407442, "rewards/code_reward/mean": 0.20142045244574547, "rewards/code_reward/std": 0.18550169840455055, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 917.75, "completions/max_terminated_length": 533.5, "completions/mean_length": 328.03125, "completions/mean_terminated_length": 271.48661041259766, "completions/min_length": 96.25, "completions/min_terminated_length": 96.25, "epoch": 0.03447600391772772, "grad_norm": 1.110612410740818, "kl": 0.26904296875, "learning_rate": 2.040865368985044e-06, "loss": 0.1496, "num_tokens": 9271114.0, "reward": 0.2448565848171711, "reward_std": 0.2629811018705368, "rewards/code_reward/mean": 0.2448565848171711, "rewards/code_reward/std": 0.26298110000789165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.5, "completions/max_terminated_length": 690.5, "completions/mean_length": 286.53125, "completions/mean_terminated_length": 286.53125, "completions/min_length": 103.25, "completions/min_terminated_length": 103.25, "epoch": 0.03458793899538268, "grad_norm": 0.9114432475860804, "kl": 0.25927734375, "learning_rate": 2.027048729182583e-06, "loss": 0.0919, "num_tokens": 9294987.0, "reward": 0.30750996619462967, "reward_std": 0.21396427508443594, "rewards/code_reward/mean": 0.30750996619462967, "rewards/code_reward/std": 0.21396427601575851, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.75, "completions/max_terminated_length": 448.75, "completions/mean_length": 235.46875, "completions/mean_terminated_length": 235.46875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.03469987407303764, "grad_norm": 1.1384514018728271, "kl": 0.2802734375, "learning_rate": 2.0132624229675205e-06, "loss": 0.0654, "num_tokens": 9320514.0, "reward": 0.31562499701976776, "reward_std": 0.12151388870552182, "rewards/code_reward/mean": 0.31562499701976776, "rewards/code_reward/std": 0.12151389149948955, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1206.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 407.65625, "completions/mean_terminated_length": 296.8125, "completions/min_length": 103.75, "completions/min_terminated_length": 103.75, "epoch": 0.034811809150692596, "grad_norm": 1.0596592897748647, "kl": 0.222412109375, "learning_rate": 1.9995070287856546e-06, "loss": 0.1233, "num_tokens": 9352039.0, "reward": 0.10777858644723892, "reward_std": 0.1648613102734089, "rewards/code_reward/mean": 0.10777858644723892, "rewards/code_reward/std": 0.1648613139986992, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.25, "completions/max_terminated_length": 472.25, "completions/mean_length": 266.09375, "completions/mean_terminated_length": 266.09375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.034923744228347556, "grad_norm": 1.3278661225451498, "kl": 0.267333984375, "learning_rate": 1.985783123785774e-06, "loss": 0.1761, "num_tokens": 9375930.0, "reward": 0.6153363855555654, "reward_std": 0.09626698028296232, "rewards/code_reward/mean": 0.6153363855555654, "rewards/code_reward/std": 0.09626698028296232, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.25, "completions/max_terminated_length": 448.25, "completions/mean_length": 285.46875, "completions/mean_terminated_length": 285.46875, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.035035679306002515, "grad_norm": 1.2384888571370045, "kl": 0.280029296875, "learning_rate": 1.9720912837954486e-06, "loss": 0.0208, "num_tokens": 9399217.0, "reward": 0.26853298489004374, "reward_std": 0.2630241848528385, "rewards/code_reward/mean": 0.26853298489004374, "rewards/code_reward/std": 0.26302417647093534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 319.59375, "completions/mean_terminated_length": 319.59375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.03514761438365748, "grad_norm": 1.5140046934528983, "kl": 0.2685546875, "learning_rate": 1.958432083296862e-06, "loss": 0.1386, "num_tokens": 9427972.0, "reward": 0.3911227434873581, "reward_std": 0.2716307928785682, "rewards/code_reward/mean": 0.3911227434873581, "rewards/code_reward/std": 0.2716307919472456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.25, "completions/max_terminated_length": 329.25, "completions/mean_length": 162.96875, "completions/mean_terminated_length": 162.96875, "completions/min_length": 85.75, "completions/min_terminated_length": 85.75, "epoch": 0.03525954946131244, "grad_norm": 1.4191024334529987, "kl": 0.25927734375, "learning_rate": 1.9448060954027093e-06, "loss": 0.0713, "num_tokens": 9447267.0, "reward": 0.5250866562128067, "reward_std": 0.19822602486237884, "rewards/code_reward/mean": 0.5250866562128067, "rewards/code_reward/std": 0.19822603231295943, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.5, "completions/max_terminated_length": 490.5, "completions/mean_length": 262.53125, "completions/mean_terminated_length": 262.53125, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.0353714845389674, "grad_norm": 1.2448753208331158, "kl": 0.245361328125, "learning_rate": 1.931213891832153e-06, "loss": 0.251, "num_tokens": 9471212.0, "reward": 0.19074449688196182, "reward_std": 0.07534042606130242, "rewards/code_reward/mean": 0.19074449688196182, "rewards/code_reward/std": 0.07534042652696371, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.25, "completions/max_terminated_length": 464.25, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 93.25, "completions/min_terminated_length": 93.25, "epoch": 0.03548341961662236, "grad_norm": 1.3212499853268918, "kl": 0.286865234375, "learning_rate": 1.9176560428868336e-06, "loss": -0.0392, "num_tokens": 9494912.0, "reward": 0.23464674223214388, "reward_std": 0.1325080880196765, "rewards/code_reward/mean": 0.23464674223214388, "rewards/code_reward/std": 0.13250808895099908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 814.5, "completions/max_terminated_length": 676.5, "completions/mean_length": 313.59375, "completions/mean_terminated_length": 265.1071472167969, "completions/min_length": 79.5, "completions/min_terminated_length": 79.5, "epoch": 0.03559535469427732, "grad_norm": 1.7038540009433336, "kl": 0.28857421875, "learning_rate": 1.9041331174269373e-06, "loss": 0.4071, "num_tokens": 9524787.0, "reward": 0.3640685440041125, "reward_std": 0.17312923236750066, "rewards/code_reward/mean": 0.3640685440041125, "rewards/code_reward/std": 0.173129228875041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 232.84375, "completions/mean_terminated_length": 232.84375, "completions/min_length": 120.25, "completions/min_terminated_length": 120.25, "epoch": 0.03570728977193228, "grad_norm": 1.3967647406257568, "kl": 0.264892578125, "learning_rate": 1.8906456828473341e-06, "loss": 0.0554, "num_tokens": 9548390.0, "reward": 0.19295948650687933, "reward_std": 0.11571824550628662, "rewards/code_reward/mean": 0.19295948650687933, "rewards/code_reward/std": 0.1157182501628995, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 901.25, "completions/max_terminated_length": 495.0, "completions/mean_length": 317.78125, "completions/mean_terminated_length": 262.4151840209961, "completions/min_length": 105.25, "completions/min_terminated_length": 105.25, "epoch": 0.03581922484958724, "grad_norm": 1.2881984024223554, "kl": 0.2744140625, "learning_rate": 1.8771943050537656e-06, "loss": -0.0248, "num_tokens": 9578255.0, "reward": 0.0896820523776114, "reward_std": 0.08542403136380017, "rewards/code_reward/mean": 0.0896820523776114, "rewards/code_reward/std": 0.08542404044419527, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1355.5, "completions/max_terminated_length": 495.5, "completions/mean_length": 346.8125, "completions/mean_terminated_length": 230.4151840209961, "completions/min_length": 79.5, "completions/min_terminated_length": 79.5, "epoch": 0.0359311599272422, "grad_norm": 1.2010225217377024, "kl": 0.206298828125, "learning_rate": 1.8637795484391046e-06, "loss": 0.562, "num_tokens": 9612289.0, "reward": 0.38001057505607605, "reward_std": 0.18961793556809425, "rewards/code_reward/mean": 0.38001057505607605, "rewards/code_reward/std": 0.18961793649941683, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 219.4375, "completions/mean_terminated_length": 219.4375, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.036043095004897156, "grad_norm": 1.188208321486219, "kl": 0.26806640625, "learning_rate": 1.8504019758596698e-06, "loss": -0.046, "num_tokens": 9634663.0, "reward": 0.1651124432682991, "reward_std": 0.14856510423123837, "rewards/code_reward/mean": 0.1651124432682991, "rewards/code_reward/std": 0.14856510609388351, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.25, "completions/max_terminated_length": 482.25, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.03615503008255212, "grad_norm": 1.1155401373073985, "kl": 0.293701171875, "learning_rate": 1.8370621486116163e-06, "loss": 0.172, "num_tokens": 9655579.0, "reward": 0.074991176254116, "reward_std": 0.06526870373636484, "rewards/code_reward/mean": 0.074991176254116, "rewards/code_reward/std": 0.06526870559900999, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.75, "completions/max_terminated_length": 332.75, "completions/mean_length": 169.78125, "completions/mean_terminated_length": 169.78125, "completions/min_length": 57.5, "completions/min_terminated_length": 57.5, "epoch": 0.03626696516020708, "grad_norm": 1.4175182217669504, "kl": 0.36181640625, "learning_rate": 1.823760626407377e-06, "loss": 0.0677, "num_tokens": 9678716.0, "reward": 0.5682446430437267, "reward_std": 0.253071456681937, "rewards/code_reward/mean": 0.5682446430437267, "rewards/code_reward/std": 0.2530714562162757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 195.03125, "completions/mean_terminated_length": 195.03125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.03637890023786204, "grad_norm": 1.466909812259168, "kl": 0.302490234375, "learning_rate": 1.8104979673521838e-06, "loss": 0.0551, "num_tokens": 9697405.0, "reward": 0.3620302341878414, "reward_std": 0.24883326888084412, "rewards/code_reward/mean": 0.3620302341878414, "rewards/code_reward/std": 0.2488332763314247, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.75, "completions/max_terminated_length": 393.75, "completions/mean_length": 216.21875, "completions/mean_terminated_length": 216.21875, "completions/min_length": 80.5, "completions/min_terminated_length": 80.5, "epoch": 0.036490835315517, "grad_norm": 1.6340135294611648, "kl": 0.302978515625, "learning_rate": 1.7972747279206482e-06, "loss": 0.0425, "num_tokens": 9716260.0, "reward": 0.2104739099740982, "reward_std": 0.11729209683835506, "rewards/code_reward/mean": 0.2104739099740982, "rewards/code_reward/std": 0.11729210242629051, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 241.09375, "completions/mean_terminated_length": 241.09375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.03660277039317196, "grad_norm": 1.1688628621437198, "kl": 0.2314453125, "learning_rate": 1.7840914629334122e-06, "loss": -0.0012, "num_tokens": 9739031.0, "reward": 0.16952253691852093, "reward_std": 0.042415026342496276, "rewards/code_reward/mean": 0.16952253691852093, "rewards/code_reward/std": 0.04241502704098821, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.5, "completions/max_terminated_length": 523.5, "completions/mean_length": 207.65625, "completions/mean_terminated_length": 207.65625, "completions/min_length": 90.5, "completions/min_terminated_length": 90.5, "epoch": 0.03671470547082692, "grad_norm": 1.0941067328247338, "kl": 0.2578125, "learning_rate": 1.7709487255338731e-06, "loss": 0.0704, "num_tokens": 9761348.0, "reward": 0.21878245938569307, "reward_std": 0.10285742627456784, "rewards/code_reward/mean": 0.21878245938569307, "rewards/code_reward/std": 0.10285743046551943, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.5, "completions/max_terminated_length": 244.5, "completions/mean_length": 134.71875, "completions/mean_terminated_length": 134.71875, "completions/min_length": 77.75, "completions/min_terminated_length": 77.75, "epoch": 0.03682664054848188, "grad_norm": 1.9134234308602869, "kl": 0.34326171875, "learning_rate": 1.7578470671649684e-06, "loss": 0.0705, "num_tokens": 9781267.0, "reward": 0.17752246744930744, "reward_std": 0.12675740150734782, "rewards/code_reward/mean": 0.17752246744930744, "rewards/code_reward/std": 0.12675740336999297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 795.0, "completions/max_terminated_length": 382.5, "completions/mean_length": 250.09375, "completions/mean_terminated_length": 192.71875381469727, "completions/min_length": 71.5, "completions/min_terminated_length": 71.5, "epoch": 0.03693857562613684, "grad_norm": 1.5415476088332252, "kl": 0.39501953125, "learning_rate": 1.744787037546045e-06, "loss": 0.2168, "num_tokens": 9808326.0, "reward": 0.21277573192492127, "reward_std": 0.23475970514118671, "rewards/code_reward/mean": 0.21277573192492127, "rewards/code_reward/std": 0.234759708866477, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.75, "completions/max_terminated_length": 282.75, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 96.25, "completions/min_terminated_length": 96.25, "epoch": 0.0370505107037918, "grad_norm": 1.7106509716957057, "kl": 0.269775390625, "learning_rate": 1.731769184649788e-06, "loss": -0.0451, "num_tokens": 9829634.0, "reward": 0.09405737672932446, "reward_std": 0.1773677747696638, "rewards/code_reward/mean": 0.09405737672932446, "rewards/code_reward/std": 0.17736777663230896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.75, "completions/max_terminated_length": 274.75, "completions/mean_length": 149.90625, "completions/mean_terminated_length": 149.90625, "completions/min_length": 60.5, "completions/min_terminated_length": 60.5, "epoch": 0.037162445781446764, "grad_norm": 2.6731683502607013, "kl": 0.664306640625, "learning_rate": 1.7187940546792325e-06, "loss": 0.0639, "num_tokens": 9848823.0, "reward": 0.0996803566813469, "reward_std": 0.07073929067701101, "rewards/code_reward/mean": 0.0996803566813469, "rewards/code_reward/std": 0.0707392911426723, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.75, "completions/max_terminated_length": 291.75, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 67.5, "completions/min_terminated_length": 67.5, "epoch": 0.03727438085910172, "grad_norm": 1.6268477182520276, "kl": 0.28955078125, "learning_rate": 1.7058621920448465e-06, "loss": 0.0592, "num_tokens": 9869263.0, "reward": 0.15218693669885397, "reward_std": 0.21367748617194593, "rewards/code_reward/mean": 0.15218693669885397, "rewards/code_reward/std": 0.21367749362252653, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.25, "completions/max_terminated_length": 377.25, "completions/mean_length": 193.34375, "completions/mean_terminated_length": 193.34375, "completions/min_length": 74.25, "completions/min_terminated_length": 74.25, "epoch": 0.03738631593675668, "grad_norm": 1.582329034740645, "kl": 0.268798828125, "learning_rate": 1.6929741393416855e-06, "loss": -0.0098, "num_tokens": 9902154.0, "reward": 0.19454657658934593, "reward_std": 0.20047161541879177, "rewards/code_reward/mean": 0.19454657658934593, "rewards/code_reward/std": 0.20047162100672722, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.75, "completions/max_terminated_length": 293.75, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 79.5, "completions/min_terminated_length": 79.5, "epoch": 0.03749825101441164, "grad_norm": 1.5771763046671186, "kl": 0.250244140625, "learning_rate": 1.6801304373266286e-06, "loss": 0.0037, "num_tokens": 9921964.0, "reward": 0.20569872483611107, "reward_std": 0.1606605793349445, "rewards/code_reward/mean": 0.20569872483611107, "rewards/code_reward/std": 0.16066057654097676, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.75, "completions/max_terminated_length": 485.75, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 203.8125, "completions/min_length": 88.75, "completions/min_terminated_length": 88.75, "epoch": 0.0376101860920666, "grad_norm": 1.7807442268855076, "kl": 0.286865234375, "learning_rate": 1.667331624895689e-06, "loss": 0.0622, "num_tokens": 9952862.0, "reward": 0.1456711394712329, "reward_std": 0.22775039146654308, "rewards/code_reward/mean": 0.1456711394712329, "rewards/code_reward/std": 0.22775039146654308, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.5, "completions/max_terminated_length": 295.5, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.03772212116972156, "grad_norm": 1.640447948869111, "kl": 0.3310546875, "learning_rate": 1.6545782390614037e-06, "loss": 0.0577, "num_tokens": 9970606.0, "reward": 0.47745162434875965, "reward_std": 0.23838305938988924, "rewards/code_reward/mean": 0.47745162434875965, "rewards/code_reward/std": 0.23838307429105043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 127.84375, "completions/mean_terminated_length": 127.84375, "completions/min_length": 71.75, "completions/min_terminated_length": 71.75, "epoch": 0.03783405624737652, "grad_norm": 2.018865032840707, "kl": 0.361328125, "learning_rate": 1.6418708149302992e-06, "loss": 0.0511, "num_tokens": 9992753.0, "reward": 0.4375, "reward_std": 0.17353582940995693, "rewards/code_reward/mean": 0.4375, "rewards/code_reward/std": 0.17353583686053753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 118.28125, "completions/mean_terminated_length": 118.28125, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.03794599132503148, "grad_norm": 1.3053097026379892, "kl": 0.3037109375, "learning_rate": 1.6292098856804423e-06, "loss": 0.0182, "num_tokens": 10005258.0, "reward": 0.3365098312497139, "reward_std": 0.20095888897776604, "rewards/code_reward/mean": 0.3365098312497139, "rewards/code_reward/std": 0.20095889456570148, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 133.25, "completions/min_terminated_length": 133.25, "epoch": 0.03805792640268644, "grad_norm": 0.9354111744654126, "kl": 0.2366943359375, "learning_rate": 1.6165959825390661e-06, "loss": 0.0313, "num_tokens": 10030994.0, "reward": 0.05368073424324393, "reward_std": 0.018324243370443583, "rewards/code_reward/mean": 0.05368073424324393, "rewards/code_reward/std": 0.018324245465919375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.25, "completions/max_terminated_length": 428.25, "completions/mean_length": 181.5625, "completions/mean_terminated_length": 181.5625, "completions/min_length": 89.75, "completions/min_terminated_length": 89.75, "epoch": 0.038169861480341405, "grad_norm": 1.6930152926719917, "kl": 0.37255859375, "learning_rate": 1.604029634760284e-06, "loss": 0.0426, "num_tokens": 10053388.0, "reward": 0.24092174973338842, "reward_std": 0.18441250827163458, "rewards/code_reward/mean": 0.24092174973338842, "rewards/code_reward/std": 0.18441250827163458, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.25, "completions/max_terminated_length": 191.25, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 50.25, "completions/min_terminated_length": 50.25, "epoch": 0.038281796557996364, "grad_norm": 2.4426044186229556, "kl": 0.513427734375, "learning_rate": 1.59151136960288e-06, "loss": -0.1329, "num_tokens": 10074740.0, "reward": 0.43505216389894485, "reward_std": 0.09191552549600601, "rewards/code_reward/mean": 0.43505216389894485, "rewards/code_reward/std": 0.09191552549600601, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.5, "completions/max_terminated_length": 375.5, "completions/mean_length": 191.9375, "completions/mean_terminated_length": 191.9375, "completions/min_length": 79.75, "completions/min_terminated_length": 79.75, "epoch": 0.038393731635651324, "grad_norm": 1.4272881181440114, "kl": 0.2998046875, "learning_rate": 1.5790417123081903e-06, "loss": 0.0731, "num_tokens": 10095146.0, "reward": 0.40253712981939316, "reward_std": 0.4054878391325474, "rewards/code_reward/mean": 0.40253712981939316, "rewards/code_reward/std": 0.40548786148428917, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 190.59375, "completions/mean_terminated_length": 190.59375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.03850566671330628, "grad_norm": 1.5919937138741604, "kl": 0.22802734375, "learning_rate": 1.5666211860780583e-06, "loss": 0.1869, "num_tokens": 10115245.0, "reward": 0.11901041585952044, "reward_std": 0.06176098808646202, "rewards/code_reward/mean": 0.11901041585952044, "rewards/code_reward/std": 0.06176098808646202, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 709.5, "completions/max_terminated_length": 495.5, "completions/mean_length": 248.71875, "completions/mean_terminated_length": 197.40625762939453, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.03861760179096124, "grad_norm": 1.8660163924674529, "kl": 0.5517578125, "learning_rate": 1.5542503120528918e-06, "loss": 0.1448, "num_tokens": 10142828.0, "reward": 0.4072798676788807, "reward_std": 0.128664406016469, "rewards/code_reward/mean": 0.4072798676788807, "rewards/code_reward/std": 0.128664406016469, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0387295368686162, "grad_norm": 1.2237817773466955, "kl": 0.3408203125, "learning_rate": 1.5419296092897866e-06, "loss": 0.1399, "num_tokens": 10168664.0, "reward": 0.02313591120764613, "reward_std": 0.02397587802261114, "rewards/code_reward/mean": 0.02313591120764613, "rewards/code_reward/std": 0.023975879419595003, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.75, "completions/max_terminated_length": 286.75, "completions/mean_length": 148.8125, "completions/mean_terminated_length": 148.8125, "completions/min_length": 71.5, "completions/min_terminated_length": 71.5, "epoch": 0.03884147194627116, "grad_norm": 2.0973241700512655, "kl": 0.444580078125, "learning_rate": 1.529659594740755e-06, "loss": 0.0837, "num_tokens": 10185306.0, "reward": 0.3948034793138504, "reward_std": 0.1760760466568172, "rewards/code_reward/mean": 0.3948034793138504, "rewards/code_reward/std": 0.17607605503872037, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 158.78125, "completions/mean_terminated_length": 158.78125, "completions/min_length": 66.75, "completions/min_terminated_length": 66.75, "epoch": 0.03895340702392612, "grad_norm": 1.4572287618545743, "kl": 0.26953125, "learning_rate": 1.5174407832310338e-06, "loss": 0.0326, "num_tokens": 10203115.0, "reward": 0.4470205195248127, "reward_std": 0.19961272552609444, "rewards/code_reward/mean": 0.4470205195248127, "rewards/code_reward/std": 0.19961273297667503, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 160.59375, "completions/mean_terminated_length": 160.59375, "completions/min_length": 91.75, "completions/min_terminated_length": 91.75, "epoch": 0.03906534210158108, "grad_norm": 1.587914542250274, "kl": 0.302978515625, "learning_rate": 1.5052736874374815e-06, "loss": 0.0176, "num_tokens": 10226750.0, "reward": 0.1811899826861918, "reward_std": 0.15240496955811977, "rewards/code_reward/mean": 0.1811899826861918, "rewards/code_reward/std": 0.15240497328341007, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.5, "completions/max_terminated_length": 255.5, "completions/mean_length": 134.0625, "completions/mean_terminated_length": 134.0625, "completions/min_length": 62.25, "completions/min_terminated_length": 62.25, "epoch": 0.039177277179236046, "grad_norm": 2.127239539473459, "kl": 0.37744140625, "learning_rate": 1.4931588178670695e-06, "loss": 0.0222, "num_tokens": 10248072.0, "reward": 0.3137185089290142, "reward_std": 0.06511987652629614, "rewards/code_reward/mean": 0.3137185089290142, "rewards/code_reward/std": 0.06511988304555416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.25, "completions/max_terminated_length": 295.25, "completions/mean_length": 149.15625, "completions/mean_terminated_length": 149.15625, "completions/min_length": 61.25, "completions/min_terminated_length": 61.25, "epoch": 0.039289212256891005, "grad_norm": 1.4510463271520762, "kl": 0.26611328125, "learning_rate": 1.4810966828354605e-06, "loss": 0.1718, "num_tokens": 10270941.0, "reward": 0.18543480592779815, "reward_std": 0.12552618235349655, "rewards/code_reward/mean": 0.18543480592779815, "rewards/code_reward/std": 0.12552619352936745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.25, "completions/max_terminated_length": 232.25, "completions/mean_length": 142.46875, "completions/mean_terminated_length": 142.46875, "completions/min_length": 60.5, "completions/min_terminated_length": 60.5, "epoch": 0.039401147334545965, "grad_norm": 2.0984464768903726, "kl": 0.345703125, "learning_rate": 1.469087788445684e-06, "loss": -0.0558, "num_tokens": 10291156.0, "reward": 0.22514494694769382, "reward_std": 0.23393048718571663, "rewards/code_reward/mean": 0.22514494694769382, "rewards/code_reward/std": 0.23393050953745842, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.5, "completions/max_terminated_length": 250.5, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 71.5, "completions/min_terminated_length": 71.5, "epoch": 0.039513082412200924, "grad_norm": 1.6718356673761792, "kl": 0.290771484375, "learning_rate": 1.4571326385668965e-06, "loss": -0.0597, "num_tokens": 10315236.0, "reward": 0.3922019712626934, "reward_std": 0.3044360801577568, "rewards/code_reward/mean": 0.3922019712626934, "rewards/code_reward/std": 0.3044360838830471, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.5, "completions/max_terminated_length": 239.5, "completions/mean_length": 131.21875, "completions/mean_terminated_length": 131.21875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.039625017489855884, "grad_norm": 1.884602182380735, "kl": 0.362060546875, "learning_rate": 1.4452317348132434e-06, "loss": 0.1699, "num_tokens": 10342059.0, "reward": 0.2568647051230073, "reward_std": 0.057626438327133656, "rewards/code_reward/mean": 0.2568647051230073, "rewards/code_reward/std": 0.05762644065544009, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 216.78125, "completions/mean_terminated_length": 216.78125, "completions/min_length": 93.25, "completions/min_terminated_length": 93.25, "epoch": 0.03973695256751084, "grad_norm": 1.5608540814114584, "kl": 0.2548828125, "learning_rate": 1.4333855765228104e-06, "loss": 0.0906, "num_tokens": 10365764.0, "reward": 0.1356297740712762, "reward_std": 0.07451130566187203, "rewards/code_reward/mean": 0.1356297740712762, "rewards/code_reward/std": 0.07451130612753332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.25, "completions/max_terminated_length": 319.25, "completions/mean_length": 133.78125, "completions/mean_terminated_length": 133.78125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.0398488876451658, "grad_norm": 1.8748645260654178, "kl": 0.329833984375, "learning_rate": 1.421594660736675e-06, "loss": -0.0276, "num_tokens": 10390429.0, "reward": 0.4849093444645405, "reward_std": 0.17599604558199644, "rewards/code_reward/mean": 0.4849093444645405, "rewards/code_reward/std": 0.17599604465067387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.5, "completions/max_terminated_length": 256.5, "completions/mean_length": 138.90625, "completions/mean_terminated_length": 138.90625, "completions/min_length": 55.25, "completions/min_terminated_length": 55.25, "epoch": 0.03996082272282076, "grad_norm": 1.9096248965574965, "kl": 0.291015625, "learning_rate": 1.4098594821780476e-06, "loss": -0.0702, "num_tokens": 10411850.0, "reward": 0.1599155543372035, "reward_std": 0.14085367415100336, "rewards/code_reward/mean": 0.1599155543372035, "rewards/code_reward/std": 0.14085367461666465, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 64.25, "completions/min_terminated_length": 64.25, "epoch": 0.04007275780047572, "grad_norm": 0.9256594634997231, "kl": 0.1795654296875, "learning_rate": 1.3981805332315174e-06, "loss": 0.0024, "num_tokens": 10434984.0, "reward": 0.3102440594229847, "reward_std": 0.08654948882758617, "rewards/code_reward/mean": 0.3102440594229847, "rewards/code_reward/std": 0.08654948882758617, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.5, "completions/max_terminated_length": 166.5, "completions/mean_length": 91.75, "completions/mean_terminated_length": 91.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.04018469287813069, "grad_norm": 1.8840891544197063, "kl": 0.41650390625, "learning_rate": 1.3865583039223929e-06, "loss": -0.1494, "num_tokens": 10457064.0, "reward": 0.215488045476377, "reward_std": 0.08538101147860289, "rewards/code_reward/mean": 0.215488045476377, "rewards/code_reward/std": 0.08538101892918348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.25, "completions/max_terminated_length": 268.25, "completions/mean_length": 169.90625, "completions/mean_terminated_length": 169.90625, "completions/min_length": 92.25, "completions/min_terminated_length": 92.25, "epoch": 0.04029662795578565, "grad_norm": 1.543399169236028, "kl": 0.379638671875, "learning_rate": 1.374993281896137e-06, "loss": -0.0768, "num_tokens": 10481869.0, "reward": 0.22161551751196384, "reward_std": 0.23532075341790915, "rewards/code_reward/mean": 0.22161551751196384, "rewards/code_reward/std": 0.2353207627311349, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.5, "completions/max_terminated_length": 324.5, "completions/mean_length": 172.65625, "completions/mean_terminated_length": 172.65625, "completions/min_length": 72.5, "completions/min_terminated_length": 72.5, "epoch": 0.040408563033440606, "grad_norm": 1.4416318985457022, "kl": 0.3037109375, "learning_rate": 1.3634859523979134e-06, "loss": -0.0104, "num_tokens": 10507626.0, "reward": 0.19733425695449114, "reward_std": 0.2422337755560875, "rewards/code_reward/mean": 0.19733425695449114, "rewards/code_reward/std": 0.2422337755560875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 132.1875, "completions/mean_terminated_length": 132.1875, "completions/min_length": 57.5, "completions/min_terminated_length": 57.5, "epoch": 0.040520498111095565, "grad_norm": 1.7918502710892381, "kl": 0.283935546875, "learning_rate": 1.3520367982522208e-06, "loss": 0.0692, "num_tokens": 10528088.0, "reward": 0.31125493720173836, "reward_std": 0.22952541639097035, "rewards/code_reward/mean": 0.31125493720173836, "rewards/code_reward/std": 0.22952541639097035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.75, "completions/max_terminated_length": 342.75, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 65.5, "completions/min_terminated_length": 65.5, "epoch": 0.040632433188750525, "grad_norm": 1.5949676136183217, "kl": 0.316162109375, "learning_rate": 1.3406462998426358e-06, "loss": -0.0073, "num_tokens": 10547284.0, "reward": 0.33256023190915585, "reward_std": 0.23680819105356932, "rewards/code_reward/mean": 0.33256023190915585, "rewards/code_reward/std": 0.2368081919848919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.75, "completions/max_terminated_length": 246.75, "completions/mean_length": 127.90625, "completions/mean_terminated_length": 127.90625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.040744368266405484, "grad_norm": 1.889587616740625, "kl": 0.40771484375, "learning_rate": 1.3293149350916595e-06, "loss": -0.0672, "num_tokens": 10564489.0, "reward": 0.3066699914634228, "reward_std": 0.09056703024543822, "rewards/code_reward/mean": 0.3066699914634228, "rewards/code_reward/std": 0.09056703303940594, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 926.75, "completions/max_terminated_length": 492.75, "completions/mean_length": 281.25, "completions/mean_terminated_length": 221.99107360839844, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.04085630334406044, "grad_norm": 1.1530140221693261, "kl": 0.166748046875, "learning_rate": 1.3180431794406623e-06, "loss": 0.5007, "num_tokens": 10590441.0, "reward": 0.25817783176898956, "reward_std": 0.1975763700902462, "rewards/code_reward/mean": 0.25817783176898956, "rewards/code_reward/std": 0.1975763738155365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 214.03125, "completions/mean_terminated_length": 214.03125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.0409682384217154, "grad_norm": 1.5427579182120241, "kl": 0.2431640625, "learning_rate": 1.3068315058299358e-06, "loss": 0.0483, "num_tokens": 10611458.0, "reward": 0.2973039257340133, "reward_std": 0.24095657613361254, "rewards/code_reward/mean": 0.2973039257340133, "rewards/code_reward/std": 0.2409565910929814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 169.53125, "completions/mean_terminated_length": 169.53125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.04108017349937036, "grad_norm": 1.5032805435473915, "kl": 0.297119140625, "learning_rate": 1.2956803846788503e-06, "loss": 0.0369, "num_tokens": 10640499.0, "reward": 0.21608419064432383, "reward_std": 0.08488713996484876, "rewards/code_reward/mean": 0.21608419064432383, "rewards/code_reward/std": 0.08488714415580034, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 154.53125, "completions/mean_terminated_length": 154.53125, "completions/min_length": 63.5, "completions/min_terminated_length": 63.5, "epoch": 0.04119210857702533, "grad_norm": 1.2609885337673685, "kl": 0.3779296875, "learning_rate": 1.284590283866116e-06, "loss": -0.103, "num_tokens": 10665612.0, "reward": 0.6632775068283081, "reward_std": 0.22238866239786148, "rewards/code_reward/mean": 0.6632775068283081, "rewards/code_reward/std": 0.22238866239786148, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.25, "completions/max_terminated_length": 287.25, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.04130404365468029, "grad_norm": 1.6198013908337032, "kl": 0.283203125, "learning_rate": 1.2735616687101518e-06, "loss": 0.0275, "num_tokens": 10687588.0, "reward": 0.04903295123949647, "reward_std": 0.02168478211387992, "rewards/code_reward/mean": 0.04903295123949647, "rewards/code_reward/std": 0.02168478397652507, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 76.5, "completions/min_terminated_length": 76.5, "epoch": 0.04141597873233525, "grad_norm": 0.9528743354172768, "kl": 0.283203125, "learning_rate": 1.2625950019495614e-06, "loss": 0.0836, "num_tokens": 10710032.0, "reward": 0.17378074233420193, "reward_std": 0.162479427177459, "rewards/code_reward/mean": 0.17378074233420193, "rewards/code_reward/std": 0.16247944394126534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.75, "completions/max_terminated_length": 193.75, "completions/mean_length": 113.15625, "completions/mean_terminated_length": 113.15625, "completions/min_length": 53.25, "completions/min_terminated_length": 53.25, "epoch": 0.041527913809990206, "grad_norm": 1.5988336537399075, "kl": 0.29150390625, "learning_rate": 1.251690743723718e-06, "loss": -0.0146, "num_tokens": 10732597.0, "reward": 0.6325892880558968, "reward_std": 0.16140316799283028, "rewards/code_reward/mean": 0.6325892880558968, "rewards/code_reward/std": 0.16140317544341087, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 107.4375, "completions/mean_terminated_length": 107.4375, "completions/min_length": 54.5, "completions/min_terminated_length": 54.5, "epoch": 0.041639848887645166, "grad_norm": 1.5312147706022616, "kl": 0.34375, "learning_rate": 1.2408493515534581e-06, "loss": 0.0191, "num_tokens": 10749987.0, "reward": 0.47150277020409703, "reward_std": 0.04196681221947074, "rewards/code_reward/mean": 0.47150277020409703, "rewards/code_reward/std": 0.04196681268513203, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.5, "completions/max_terminated_length": 254.5, "completions/mean_length": 179.5625, "completions/mean_terminated_length": 179.5625, "completions/min_length": 109.25, "completions/min_terminated_length": 109.25, "epoch": 0.041751783965300125, "grad_norm": 1.2602835048997771, "kl": 0.37158203125, "learning_rate": 1.2300712803218834e-06, "loss": 0.0472, "num_tokens": 10773077.0, "reward": 0.2942133641336113, "reward_std": 0.06957495538517833, "rewards/code_reward/mean": 0.2942133641336113, "rewards/code_reward/std": 0.06957494793459773, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.25, "completions/max_terminated_length": 411.25, "completions/mean_length": 237.9375, "completions/mean_terminated_length": 237.9375, "completions/min_length": 83.5, "completions/min_terminated_length": 83.5, "epoch": 0.041863719042955085, "grad_norm": 1.77643334650333, "kl": 0.2418212890625, "learning_rate": 1.2193569822552772e-06, "loss": -0.0534, "num_tokens": 10800323.0, "reward": 0.34760985895991325, "reward_std": 0.09279043786227703, "rewards/code_reward/mean": 0.34760985895991325, "rewards/code_reward/std": 0.09279044345021248, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 704.75, "completions/max_terminated_length": 458.25, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 221.75000762939453, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.041975654120610044, "grad_norm": 1.7108480467121594, "kl": 0.26763916015625, "learning_rate": 1.2087069069041268e-06, "loss": 0.075, "num_tokens": 10826268.0, "reward": 0.09669792652130127, "reward_std": 0.14347750786691904, "rewards/code_reward/mean": 0.09669792652130127, "rewards/code_reward/std": 0.14347750786691904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.25, "completions/max_terminated_length": 446.25, "completions/mean_length": 234.9375, "completions/mean_terminated_length": 234.9375, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.042087589198265, "grad_norm": 1.2434311763858945, "kl": 0.213134765625, "learning_rate": 1.1981215011242654e-06, "loss": 0.131, "num_tokens": 10854130.0, "reward": 0.043518811551621184, "reward_std": 0.047120289877057076, "rewards/code_reward/mean": 0.043518811551621184, "rewards/code_reward/std": 0.047120293602347374, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 136.03125, "completions/mean_terminated_length": 136.03125, "completions/min_length": 54.25, "completions/min_terminated_length": 54.25, "epoch": 0.04219952427591997, "grad_norm": 1.671522684663737, "kl": 0.36328125, "learning_rate": 1.1876012090581184e-06, "loss": 0.0534, "num_tokens": 10877771.0, "reward": 0.5510788485407829, "reward_std": 0.11537208966910839, "rewards/code_reward/mean": 0.5510788485407829, "rewards/code_reward/std": 0.11537209153175354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.25, "completions/max_terminated_length": 304.25, "completions/mean_length": 165.71875, "completions/mean_terminated_length": 165.71875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.04231145935357493, "grad_norm": 1.1250436666643788, "kl": 0.33447265625, "learning_rate": 1.177146472116071e-06, "loss": -0.0308, "num_tokens": 10904074.0, "reward": 0.04710310218797531, "reward_std": 0.030758424138184637, "rewards/code_reward/mean": 0.04710310218797531, "rewards/code_reward/std": 0.030758424138184637, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 151.8125, "completions/mean_terminated_length": 151.8125, "completions/min_length": 73.5, "completions/min_terminated_length": 73.5, "epoch": 0.04242339443122989, "grad_norm": 1.360047496133987, "kl": 0.3671875, "learning_rate": 1.1667577289579462e-06, "loss": -0.0164, "num_tokens": 10930116.0, "reward": 0.4695088779553771, "reward_std": 0.12898865342140198, "rewards/code_reward/mean": 0.4695088779553771, "rewards/code_reward/std": 0.12898865342140198, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.5, "completions/max_terminated_length": 281.5, "completions/mean_length": 191.65625, "completions/mean_terminated_length": 191.65625, "completions/min_length": 91.75, "completions/min_terminated_length": 91.75, "epoch": 0.04253532950888485, "grad_norm": 1.396767149969482, "kl": 0.222412109375, "learning_rate": 1.1564354154746007e-06, "loss": 0.0289, "num_tokens": 10951289.0, "reward": 0.38920454680919647, "reward_std": 0.1452226829715073, "rewards/code_reward/mean": 0.38920454680919647, "rewards/code_reward/std": 0.14522269228473306, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.25, "completions/max_terminated_length": 232.25, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 56.75, "completions/min_terminated_length": 56.75, "epoch": 0.04264726458653981, "grad_norm": 2.272607027139455, "kl": 0.4501953125, "learning_rate": 1.146179964769635e-06, "loss": -0.0172, "num_tokens": 10973007.0, "reward": 0.5514450334012508, "reward_std": 0.1807562008034438, "rewards/code_reward/mean": 0.5514450334012508, "rewards/code_reward/std": 0.1807561982423067, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.25, "completions/max_terminated_length": 377.25, "completions/mean_length": 200.40625, "completions/mean_terminated_length": 200.40625, "completions/min_length": 75.5, "completions/min_terminated_length": 75.5, "epoch": 0.042759199664194766, "grad_norm": 1.8894869007416317, "kl": 0.28662109375, "learning_rate": 1.1359918071412195e-06, "loss": 0.0651, "num_tokens": 11000324.0, "reward": 0.3411928308196366, "reward_std": 0.15844399761408567, "rewards/code_reward/mean": 0.3411928308196366, "rewards/code_reward/std": 0.15844399388879538, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.25, "completions/max_terminated_length": 199.25, "completions/mean_length": 122.78125, "completions/mean_terminated_length": 122.78125, "completions/min_length": 74.75, "completions/min_terminated_length": 74.75, "epoch": 0.042871134741849726, "grad_norm": 2.4633639922004784, "kl": 0.448486328125, "learning_rate": 1.1258713700640456e-06, "loss": -0.0042, "num_tokens": 11025333.0, "reward": 0.39490123838186264, "reward_std": 0.09689067304134369, "rewards/code_reward/mean": 0.39490123838186264, "rewards/code_reward/std": 0.09689067304134369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 170.78125, "completions/mean_terminated_length": 170.78125, "completions/min_length": 94.25, "completions/min_terminated_length": 94.25, "epoch": 0.042983069819504685, "grad_norm": 1.5596521493483513, "kl": 0.26416015625, "learning_rate": 1.115819078171383e-06, "loss": -0.0304, "num_tokens": 11052478.0, "reward": 0.11266797501593828, "reward_std": 0.04459898290224373, "rewards/code_reward/mean": 0.11266797501593828, "rewards/code_reward/std": 0.04459898569621146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.75, "completions/max_terminated_length": 396.75, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.043095004897159644, "grad_norm": 1.2904801129088306, "kl": 0.337890625, "learning_rate": 1.1058353532372667e-06, "loss": 0.0852, "num_tokens": 11072608.0, "reward": 0.39945168420672417, "reward_std": 0.24530693516135216, "rewards/code_reward/mean": 0.39945168420672417, "rewards/code_reward/std": 0.24530693143606186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.5, "completions/max_terminated_length": 270.5, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 80.5, "completions/min_terminated_length": 80.5, "epoch": 0.04320693997481461, "grad_norm": 1.533287258494552, "kl": 0.321533203125, "learning_rate": 1.0959206141587998e-06, "loss": -0.0497, "num_tokens": 11094568.0, "reward": 0.32392971869558096, "reward_std": 0.0603926875628531, "rewards/code_reward/mean": 0.32392971869558096, "rewards/code_reward/std": 0.060392691288143396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.25, "completions/max_terminated_length": 460.25, "completions/mean_length": 244.65625, "completions/mean_terminated_length": 244.65625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.04331887505246957, "grad_norm": 1.673535799840682, "kl": 0.1998291015625, "learning_rate": 1.0860752769385766e-06, "loss": -0.0478, "num_tokens": 11115893.0, "reward": 0.19461633265018463, "reward_std": 0.2882770374417305, "rewards/code_reward/mean": 0.19461633265018463, "rewards/code_reward/std": 0.2882770411670208, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.75, "completions/max_terminated_length": 260.75, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.04343081013012453, "grad_norm": 1.8694819476907374, "kl": 0.28662109375, "learning_rate": 1.0762997546672279e-06, "loss": -0.1618, "num_tokens": 11140117.0, "reward": 0.13581378757953644, "reward_std": 0.1375128449872136, "rewards/code_reward/mean": 0.13581378757953644, "rewards/code_reward/std": 0.1375128524377942, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.25, "completions/max_terminated_length": 363.25, "completions/mean_length": 193.84375, "completions/mean_terminated_length": 193.84375, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "epoch": 0.04354274520777949, "grad_norm": 1.0753739650687322, "kl": 0.333984375, "learning_rate": 1.0665944575060914e-06, "loss": 0.0196, "num_tokens": 11165288.0, "reward": 0.2044280730187893, "reward_std": 0.20719696558080614, "rewards/code_reward/mean": 0.2044280730187893, "rewards/code_reward/std": 0.20719696604646742, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.5, "completions/max_terminated_length": 206.5, "completions/mean_length": 101.28125, "completions/mean_terminated_length": 101.28125, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "epoch": 0.04365468028543445, "grad_norm": 2.0710686171209, "kl": 0.34033203125, "learning_rate": 1.056959792669997e-06, "loss": 0.0855, "num_tokens": 11184777.0, "reward": 0.3098377622663975, "reward_std": 0.11287019960582256, "rewards/code_reward/mean": 0.3098377622663975, "rewards/code_reward/std": 0.11287020146846771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.5, "completions/max_terminated_length": 224.5, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.04376661536308941, "grad_norm": 1.6186447722026904, "kl": 0.3369140625, "learning_rate": 1.0473961644101856e-06, "loss": 0.0431, "num_tokens": 11207051.0, "reward": 0.40973464399576187, "reward_std": 0.2817695839330554, "rewards/code_reward/mean": 0.40973464399576187, "rewards/code_reward/std": 0.281769591383636, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.25, "completions/max_terminated_length": 391.25, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 222.3125, "completions/min_length": 73.25, "completions/min_terminated_length": 73.25, "epoch": 0.04387855044074437, "grad_norm": 1.1817397898346427, "kl": 0.214599609375, "learning_rate": 1.037903973997345e-06, "loss": 0.0475, "num_tokens": 11225013.0, "reward": 0.30992063134908676, "reward_std": 0.14644738845527172, "rewards/code_reward/mean": 0.30992063134908676, "rewards/code_reward/std": 0.14644739404320717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.5, "completions/max_terminated_length": 259.5, "completions/mean_length": 149.8125, "completions/mean_terminated_length": 149.8125, "completions/min_length": 86.25, "completions/min_terminated_length": 86.25, "epoch": 0.043990485518399326, "grad_norm": 1.2696184824554122, "kl": 0.26806640625, "learning_rate": 1.0284836197047737e-06, "loss": -0.0078, "num_tokens": 11242503.0, "reward": 0.4278051145374775, "reward_std": 0.09913837909698486, "rewards/code_reward/mean": 0.4278051145374775, "rewards/code_reward/std": 0.09913837816566229, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 150.03125, "completions/mean_terminated_length": 150.03125, "completions/min_length": 65.5, "completions/min_terminated_length": 65.5, "epoch": 0.044102420596054286, "grad_norm": 1.45552546760352, "kl": 0.28857421875, "learning_rate": 1.0191354967916712e-06, "loss": 0.0331, "num_tokens": 11269880.0, "reward": 0.24055082583799958, "reward_std": 0.11007735197199509, "rewards/code_reward/mean": 0.24055082583799958, "rewards/code_reward/std": 0.11007736308965832, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.04421435567370925, "grad_norm": 1.368061303454261, "kl": 0.311279296875, "learning_rate": 1.0098599974865515e-06, "loss": 0.0704, "num_tokens": 11297700.0, "reward": 0.07068161107599735, "reward_std": 0.11775721522280946, "rewards/code_reward/mean": 0.07068161107599735, "rewards/code_reward/std": 0.11775722278980538, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 166.28125, "completions/mean_terminated_length": 166.28125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.04432629075136421, "grad_norm": 2.02580412202015, "kl": 0.312255859375, "learning_rate": 1.0006575109707898e-06, "loss": 0.1445, "num_tokens": 11315909.0, "reward": 0.2796209901571274, "reward_std": 0.20085123018361628, "rewards/code_reward/mean": 0.2796209901571274, "rewards/code_reward/std": 0.200851232977584, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.5, "completions/max_terminated_length": 423.5, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.04443822582901917, "grad_norm": 1.711611197709339, "kl": 0.398681640625, "learning_rate": 9.915284233622877e-07, "loss": -0.0014, "num_tokens": 11345121.0, "reward": 0.3469575219787657, "reward_std": 0.2414399441331625, "rewards/code_reward/mean": 0.3469575219787657, "rewards/code_reward/std": 0.2414399590343237, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.25, "completions/max_terminated_length": 339.25, "completions/mean_length": 163.15625, "completions/mean_terminated_length": 163.15625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.04455016090667413, "grad_norm": 1.6559828290200511, "kl": 0.32080078125, "learning_rate": 9.824731176992796e-07, "loss": 0.0554, "num_tokens": 11366862.0, "reward": 0.21360408567124978, "reward_std": 0.14055794943124056, "rewards/code_reward/mean": 0.21360408567124978, "rewards/code_reward/std": 0.14055794943124056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.5, "completions/max_terminated_length": 250.5, "completions/mean_length": 163.09375, "completions/mean_terminated_length": 163.09375, "completions/min_length": 97.25, "completions/min_terminated_length": 97.25, "epoch": 0.04466209598432909, "grad_norm": 1.4395787879499458, "kl": 0.2861328125, "learning_rate": 9.734919739242543e-07, "loss": 0.0094, "num_tokens": 11390465.0, "reward": 0.37181805819272995, "reward_std": 0.13883061078377068, "rewards/code_reward/mean": 0.37181805819272995, "rewards/code_reward/std": 0.1388306178851053, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.75, "completions/max_terminated_length": 351.75, "completions/mean_length": 168.9375, "completions/mean_terminated_length": 168.9375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.04477403106198405, "grad_norm": 1.5497285885828123, "kl": 0.28857421875, "learning_rate": 9.645853688680177e-07, "loss": -0.0077, "num_tokens": 11412903.0, "reward": 0.22598881646990776, "reward_std": 0.05764714028919116, "rewards/code_reward/mean": 0.22598881646990776, "rewards/code_reward/std": 0.05764713906683028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 810.0, "completions/max_terminated_length": 618.5, "completions/mean_length": 326.8125, "completions/mean_terminated_length": 279.2276916503906, "completions/min_length": 143.75, "completions/min_terminated_length": 143.75, "epoch": 0.04488596613963901, "grad_norm": 1.3195383455561336, "kl": 0.1973876953125, "learning_rate": 9.557536762338786e-07, "loss": 0.1984, "num_tokens": 11445705.0, "reward": 0.4391447389498353, "reward_std": 0.2860143817961216, "rewards/code_reward/mean": 0.4391447389498353, "rewards/code_reward/std": 0.28601440228521824, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.25, "completions/max_terminated_length": 287.25, "completions/mean_length": 152.84375, "completions/mean_terminated_length": 152.84375, "completions/min_length": 75.25, "completions/min_terminated_length": 75.25, "epoch": 0.04499790121729397, "grad_norm": 1.3849099249418695, "kl": 0.2900390625, "learning_rate": 9.46997266581973e-07, "loss": 0.0243, "num_tokens": 11470668.0, "reward": 0.5938801132142544, "reward_std": 0.22660082660149783, "rewards/code_reward/mean": 0.5938801132142544, "rewards/code_reward/std": 0.22660081752110273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.5, "completions/max_terminated_length": 446.5, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 149.25, "completions/min_terminated_length": 149.25, "epoch": 0.04510983629494893, "grad_norm": 1.1229376006957024, "kl": 0.2052001953125, "learning_rate": 9.383165073137115e-07, "loss": -0.0179, "num_tokens": 11493260.0, "reward": 0.36087851971387863, "reward_std": 0.14250769466161728, "rewards/code_reward/mean": 0.36087851971387863, "rewards/code_reward/std": 0.14250769466161728, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.75, "completions/max_terminated_length": 299.75, "completions/mean_length": 161.46875, "completions/mean_terminated_length": 161.46875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.04522177137260389, "grad_norm": 1.3839235753481969, "kl": 0.34619140625, "learning_rate": 9.297117626563687e-07, "loss": 0.1469, "num_tokens": 11513939.0, "reward": 0.6742284968495369, "reward_std": 0.05968676181510091, "rewards/code_reward/mean": 0.6742284968495369, "rewards/code_reward/std": 0.059686762280762196, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.5, "completions/max_terminated_length": 180.5, "completions/mean_length": 114.53125, "completions/mean_terminated_length": 114.53125, "completions/min_length": 73.75, "completions/min_terminated_length": 73.75, "epoch": 0.04533370645025885, "grad_norm": 2.4362013118805237, "kl": 0.326171875, "learning_rate": 9.211833936477957e-07, "loss": 0.0929, "num_tokens": 11532444.0, "reward": 0.33231060579419136, "reward_std": 0.09006076445803046, "rewards/code_reward/mean": 0.33231060579419136, "rewards/code_reward/std": 0.09006076492369175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.75, "completions/max_terminated_length": 474.75, "completions/mean_length": 210.15625, "completions/mean_terminated_length": 210.15625, "completions/min_length": 100.25, "completions/min_terminated_length": 100.25, "epoch": 0.04544564152791381, "grad_norm": 1.3572230509856378, "kl": 0.225341796875, "learning_rate": 9.127317581212753e-07, "loss": -0.13, "num_tokens": 11553801.0, "reward": 0.4415045604109764, "reward_std": 0.1545610846951604, "rewards/code_reward/mean": 0.4415045604109764, "rewards/code_reward/std": 0.1545610912144184, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 65.25, "completions/min_terminated_length": 65.25, "epoch": 0.04555757660556877, "grad_norm": 1.7906307431939532, "kl": 0.37890625, "learning_rate": 9.043572106905084e-07, "loss": -0.0301, "num_tokens": 11574885.0, "reward": 0.2516532065346837, "reward_std": 0.1726220678538084, "rewards/code_reward/mean": 0.2516532065346837, "rewards/code_reward/std": 0.17262207716703415, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 714.25, "completions/max_terminated_length": 451.0, "completions/mean_length": 230.8125, "completions/mean_terminated_length": 174.68303680419922, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.04566951168322373, "grad_norm": 2.5575273200953244, "kl": 0.662841796875, "learning_rate": 8.960601027347321e-07, "loss": 0.2496, "num_tokens": 11602207.0, "reward": 0.5628770813345909, "reward_std": 0.1447618722449988, "rewards/code_reward/mean": 0.5628770813345909, "rewards/code_reward/std": 0.14476187201216817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.75, "completions/max_terminated_length": 240.75, "completions/mean_length": 148.78125, "completions/mean_terminated_length": 148.78125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.04578144676087869, "grad_norm": 1.6621037231884788, "kl": 0.3466796875, "learning_rate": 8.878407823839788e-07, "loss": 0.0366, "num_tokens": 11618608.0, "reward": 0.25418527983129025, "reward_std": 0.11003150884062052, "rewards/code_reward/mean": 0.25418527983129025, "rewards/code_reward/std": 0.11003150977194309, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.5, "completions/max_terminated_length": 213.5, "completions/mean_length": 146.90625, "completions/mean_terminated_length": 146.90625, "completions/min_length": 91.25, "completions/min_terminated_length": 91.25, "epoch": 0.04589338183853365, "grad_norm": 1.4559367674936419, "kl": 0.26123046875, "learning_rate": 8.796995945044689e-07, "loss": 0.0374, "num_tokens": 11637933.0, "reward": 0.36544950399547815, "reward_std": 0.024054846144281328, "rewards/code_reward/mean": 0.36544950399547815, "rewards/code_reward/std": 0.02405484637711197, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.5, "completions/max_terminated_length": 236.5, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 87.25, "completions/min_terminated_length": 87.25, "epoch": 0.04600531691618861, "grad_norm": 1.6433420190741899, "kl": 0.28955078125, "learning_rate": 8.716368806841405e-07, "loss": -0.0265, "num_tokens": 11658509.0, "reward": 0.3223713766783476, "reward_std": 0.1642971858382225, "rewards/code_reward/mean": 0.3223713766783476, "rewards/code_reward/std": 0.16429719096049666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.25, "completions/max_terminated_length": 392.25, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.04611725199384357, "grad_norm": 1.0424618486352764, "kl": 0.280517578125, "learning_rate": 8.636529792183171e-07, "loss": 0.0174, "num_tokens": 11683955.0, "reward": 0.3553215153515339, "reward_std": 0.1284142378717661, "rewards/code_reward/mean": 0.3553215153515339, "rewards/code_reward/std": 0.1284142378717661, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 249.4375, "completions/mean_terminated_length": 249.4375, "completions/min_length": 124.25, "completions/min_terminated_length": 124.25, "epoch": 0.046229187071498534, "grad_norm": 0.9268703240625641, "kl": 0.181884765625, "learning_rate": 8.557482250955144e-07, "loss": 0.0329, "num_tokens": 11707953.0, "reward": 0.5278465449810028, "reward_std": 0.059144818456843495, "rewards/code_reward/mean": 0.5278465449810028, "rewards/code_reward/std": 0.05914481892250478, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.75, "completions/max_terminated_length": 324.75, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "epoch": 0.046341122149153494, "grad_norm": 1.2660479068382882, "kl": 0.270751953125, "learning_rate": 8.479229499833844e-07, "loss": 0.0482, "num_tokens": 11731301.0, "reward": 0.20539462007582188, "reward_std": 0.1615639952942729, "rewards/code_reward/mean": 0.20539462007582188, "rewards/code_reward/std": 0.1615639952942729, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.25, "completions/max_terminated_length": 262.25, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 97.75, "completions/min_terminated_length": 97.75, "epoch": 0.04645305722680845, "grad_norm": 1.460499815260377, "kl": 0.33349609375, "learning_rate": 8.401774822147976e-07, "loss": 0.0594, "num_tokens": 11754019.0, "reward": 0.3291256055235863, "reward_std": 0.1305392780341208, "rewards/code_reward/mean": 0.3291256055235863, "rewards/code_reward/std": 0.13053929095622152, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.5, "completions/max_terminated_length": 680.5, "completions/mean_length": 255.3125, "completions/mean_terminated_length": 255.3125, "completions/min_length": 91.25, "completions/min_terminated_length": 91.25, "epoch": 0.04656499230446341, "grad_norm": 1.5950867620592668, "kl": 0.260498046875, "learning_rate": 8.325121467740695e-07, "loss": 0.0056, "num_tokens": 11784677.0, "reward": 0.4860835336148739, "reward_std": 0.19814053922891617, "rewards/code_reward/mean": 0.4860835336148739, "rewards/code_reward/std": 0.19814054295420647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.75, "completions/max_terminated_length": 380.75, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 203.8125, "completions/min_length": 100.25, "completions/min_terminated_length": 100.25, "epoch": 0.04667692738211837, "grad_norm": 1.4255259834081973, "kl": 0.23291015625, "learning_rate": 8.249272652833226e-07, "loss": 0.0277, "num_tokens": 11812087.0, "reward": 0.1272990070283413, "reward_std": 0.05336737190373242, "rewards/code_reward/mean": 0.1272990070283413, "rewards/code_reward/std": 0.053367371554486454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 193.78125, "completions/mean_terminated_length": 193.78125, "completions/min_length": 109.75, "completions/min_terminated_length": 109.75, "epoch": 0.04678886245977333, "grad_norm": 1.3483016005578479, "kl": 0.382568359375, "learning_rate": 8.174231559889931e-07, "loss": -0.0485, "num_tokens": 11828464.0, "reward": 0.3802599012851715, "reward_std": 0.24896394088864326, "rewards/code_reward/mean": 0.3802599012851715, "rewards/code_reward/std": 0.24896394088864326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.75, "completions/max_terminated_length": 240.75, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 95.5, "completions/min_terminated_length": 95.5, "epoch": 0.04690079753742829, "grad_norm": 1.4554834982831377, "kl": 0.31591796875, "learning_rate": 8.100001337484787e-07, "loss": 0.0997, "num_tokens": 11847600.0, "reward": 0.47475508879870176, "reward_std": 0.10649433638900518, "rewards/code_reward/mean": 0.47475508879870176, "rewards/code_reward/std": 0.10649433825165033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.5, "completions/max_terminated_length": 308.5, "completions/mean_length": 141.1875, "completions/mean_terminated_length": 141.1875, "completions/min_length": 68.75, "completions/min_terminated_length": 68.75, "epoch": 0.04701273261508325, "grad_norm": 2.2293342326111887, "kl": 0.29931640625, "learning_rate": 8.026585100169251e-07, "loss": -0.2137, "num_tokens": 11868750.0, "reward": 0.5101216156035662, "reward_std": 0.014662902103736997, "rewards/code_reward/mean": 0.5101216156035662, "rewards/code_reward/std": 0.014662901870906353, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.5, "completions/max_terminated_length": 297.5, "completions/mean_length": 166.3125, "completions/mean_terminated_length": 166.3125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.04712466769273821, "grad_norm": 1.7014274221258614, "kl": 0.2890625, "learning_rate": 7.953985928341601e-07, "loss": 0.0663, "num_tokens": 11895960.0, "reward": 0.5694793821312487, "reward_std": 0.16413932980503887, "rewards/code_reward/mean": 0.5694793821312487, "rewards/code_reward/std": 0.16413932980503887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.75, "completions/max_terminated_length": 279.75, "completions/mean_length": 171.78125, "completions/mean_terminated_length": 171.78125, "completions/min_length": 101.5, "completions/min_terminated_length": 101.5, "epoch": 0.047236602770393175, "grad_norm": 1.1761445629861498, "kl": 0.256103515625, "learning_rate": 7.882206868117693e-07, "loss": -0.0198, "num_tokens": 11919857.0, "reward": 0.7860226929187775, "reward_std": 0.15767237346153706, "rewards/code_reward/mean": 0.7860226929187775, "rewards/code_reward/std": 0.1576723720645532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.5, "completions/max_terminated_length": 335.5, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 92.5, "completions/min_terminated_length": 92.5, "epoch": 0.047348537848048135, "grad_norm": 1.8051288181053138, "kl": 0.281982421875, "learning_rate": 7.81125093120313e-07, "loss": 0.0279, "num_tokens": 11942537.0, "reward": 0.3795018047094345, "reward_std": 0.15914139337837696, "rewards/code_reward/mean": 0.3795018047094345, "rewards/code_reward/std": 0.15914138592779636, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.25, "completions/max_terminated_length": 387.25, "completions/mean_length": 178.40625, "completions/mean_terminated_length": 178.40625, "completions/min_length": 84.75, "completions/min_terminated_length": 84.75, "epoch": 0.047460472925703094, "grad_norm": 1.4032630704318265, "kl": 0.30078125, "learning_rate": 7.741121094766916e-07, "loss": -0.1775, "num_tokens": 11966390.0, "reward": 0.6128955632448196, "reward_std": 0.11446365877054632, "rewards/code_reward/mean": 0.6128955632448196, "rewards/code_reward/std": 0.11446366063319147, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.25, "completions/max_terminated_length": 384.25, "completions/mean_length": 168.40625, "completions/mean_terminated_length": 168.40625, "completions/min_length": 70.25, "completions/min_terminated_length": 70.25, "epoch": 0.047572408003358053, "grad_norm": 1.7373521797945488, "kl": 0.27392578125, "learning_rate": 7.671820301316532e-07, "loss": 0.1031, "num_tokens": 11991491.0, "reward": 0.5329861111240461, "reward_std": 0.2041158601641655, "rewards/code_reward/mean": 0.5329861111240461, "rewards/code_reward/std": 0.2041158601641655, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.5, "completions/max_terminated_length": 376.5, "completions/mean_length": 191.46875, "completions/mean_terminated_length": 191.46875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.04768434308101301, "grad_norm": 2.0077272533591266, "kl": 0.25927734375, "learning_rate": 7.603351458574474e-07, "loss": 0.1358, "num_tokens": 12013706.0, "reward": 0.2916666716337204, "reward_std": 0.07259188406169415, "rewards/code_reward/mean": 0.2916666716337204, "rewards/code_reward/std": 0.072591882199049, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.5, "completions/max_terminated_length": 479.5, "completions/mean_length": 218.28125, "completions/mean_terminated_length": 218.28125, "completions/min_length": 85.25, "completions/min_terminated_length": 85.25, "epoch": 0.04779627815866797, "grad_norm": 1.3492407743127255, "kl": 0.30029296875, "learning_rate": 7.535717439356255e-07, "loss": 0.031, "num_tokens": 12042155.0, "reward": 0.5065476968884468, "reward_std": 0.2563619986176491, "rewards/code_reward/mean": 0.5065476968884468, "rewards/code_reward/std": 0.2563620023429394, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.5, "completions/max_terminated_length": 268.5, "completions/mean_length": 178.71875, "completions/mean_terminated_length": 178.71875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.04790821323632293, "grad_norm": 1.279825584779865, "kl": 0.3115234375, "learning_rate": 7.46892108144986e-07, "loss": -0.0739, "num_tokens": 12066530.0, "reward": 0.5710227191448212, "reward_std": 0.18481200002133846, "rewards/code_reward/mean": 0.5710227191448212, "rewards/code_reward/std": 0.18481199722737074, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.75, "completions/max_terminated_length": 262.75, "completions/mean_length": 157.84375, "completions/mean_terminated_length": 157.84375, "completions/min_length": 91.5, "completions/min_terminated_length": 91.5, "epoch": 0.04802014831397789, "grad_norm": 1.2537079099692274, "kl": 0.32080078125, "learning_rate": 7.402965187496697e-07, "loss": -0.062, "num_tokens": 12093773.0, "reward": 0.40365831553936005, "reward_std": 0.06347889173775911, "rewards/code_reward/mean": 0.40365831553936005, "rewards/code_reward/std": 0.06347889162134379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.75, "completions/max_terminated_length": 342.75, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 84.25, "completions/min_terminated_length": 84.25, "epoch": 0.04813208339163285, "grad_norm": 1.5682030729649754, "kl": 0.3115234375, "learning_rate": 7.337852524873974e-07, "loss": 0.0633, "num_tokens": 12119825.0, "reward": 0.4007348418235779, "reward_std": 0.2273004651069641, "rewards/code_reward/mean": 0.4007348418235779, "rewards/code_reward/std": 0.227300476282835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.75, "completions/max_terminated_length": 307.75, "completions/mean_length": 166.8125, "completions/mean_terminated_length": 166.8125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.04824401846928781, "grad_norm": 1.414420572830449, "kl": 0.25830078125, "learning_rate": 7.273585825578608e-07, "loss": -0.004, "num_tokens": 12141963.0, "reward": 0.09160848939791322, "reward_std": 0.09675811271881685, "rewards/code_reward/mean": 0.09160848939791322, "rewards/code_reward/std": 0.09675811271881685, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 160.6875, "completions/mean_terminated_length": 160.6875, "completions/min_length": 85.75, "completions/min_terminated_length": 85.75, "epoch": 0.048355953546942776, "grad_norm": 1.2819351419525538, "kl": 0.299072265625, "learning_rate": 7.21016778611259e-07, "loss": 0.0442, "num_tokens": 12160385.0, "reward": 0.3042712155729532, "reward_std": 0.1732648597098887, "rewards/code_reward/mean": 0.3042712155729532, "rewards/code_reward/std": 0.17326486064121127, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 199.65625, "completions/mean_terminated_length": 199.65625, "completions/min_length": 90.25, "completions/min_terminated_length": 90.25, "epoch": 0.048467888624597735, "grad_norm": 1.4343614720564808, "kl": 0.3349609375, "learning_rate": 7.147601067369835e-07, "loss": -0.0444, "num_tokens": 12183238.0, "reward": 0.2736266343854368, "reward_std": 0.11790771875530481, "rewards/code_reward/mean": 0.2736266343854368, "rewards/code_reward/std": 0.11790771875530481, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.5, "completions/max_terminated_length": 430.5, "completions/mean_length": 220.1875, "completions/mean_terminated_length": 220.1875, "completions/min_length": 140.25, "completions/min_terminated_length": 140.25, "epoch": 0.048579823702252695, "grad_norm": 1.5149123099259716, "kl": 0.208740234375, "learning_rate": 7.085888294524561e-07, "loss": 0.0555, "num_tokens": 12205652.0, "reward": 0.1229942380450666, "reward_std": 0.1744669363833964, "rewards/code_reward/mean": 0.1229942380450666, "rewards/code_reward/std": 0.17446694057434797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.5, "completions/max_terminated_length": 280.5, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.048691758779907654, "grad_norm": 1.1887088926887228, "kl": 0.293701171875, "learning_rate": 7.025032056921117e-07, "loss": 0.0018, "num_tokens": 12225632.0, "reward": 0.36100322124548256, "reward_std": 0.18402530439198017, "rewards/code_reward/mean": 0.36100322124548256, "rewards/code_reward/std": 0.18402530439198017, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 165.9375, "completions/mean_terminated_length": 165.9375, "completions/min_length": 77.5, "completions/min_terminated_length": 77.5, "epoch": 0.04880369385756261, "grad_norm": 1.3337355570198512, "kl": 0.3330078125, "learning_rate": 6.965034907965349e-07, "loss": -0.0914, "num_tokens": 12244678.0, "reward": 0.5988663695752621, "reward_std": 0.20593830198049545, "rewards/code_reward/mean": 0.5988663695752621, "rewards/code_reward/std": 0.20593830046709627, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 690.25, "completions/max_terminated_length": 308.5, "completions/mean_length": 242.875, "completions/mean_terminated_length": 187.48214721679688, "completions/min_length": 99.75, "completions/min_terminated_length": 99.75, "epoch": 0.04891562893521757, "grad_norm": 1.333932781238413, "kl": 0.2191162109375, "learning_rate": 6.905899365017462e-07, "loss": 0.1709, "num_tokens": 12265450.0, "reward": 0.28224857337772846, "reward_std": 0.13666313188150525, "rewards/code_reward/mean": 0.28224857337772846, "rewards/code_reward/std": 0.13666313188150525, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.5, "completions/max_terminated_length": 410.5, "completions/mean_length": 176.8125, "completions/mean_terminated_length": 176.8125, "completions/min_length": 76.25, "completions/min_terminated_length": 76.25, "epoch": 0.04902756401287253, "grad_norm": 1.2985967740803328, "kl": 0.283447265625, "learning_rate": 6.847627909286409e-07, "loss": 0.1118, "num_tokens": 12284524.0, "reward": 0.40528881177306175, "reward_std": 0.15535564813762903, "rewards/code_reward/mean": 0.40528881177306175, "rewards/code_reward/std": 0.15535564627498388, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.25, "completions/max_terminated_length": 376.25, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 91.75, "completions/min_terminated_length": 91.75, "epoch": 0.04913949909052749, "grad_norm": 1.1950680004085388, "kl": 0.315185546875, "learning_rate": 6.790222985725761e-07, "loss": 0.025, "num_tokens": 12306080.0, "reward": 0.48305153474211693, "reward_std": 0.12105439510196447, "rewards/code_reward/mean": 0.48305153474211693, "rewards/code_reward/std": 0.12105440441519022, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.5, "completions/max_terminated_length": 223.5, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 79.5, "completions/min_terminated_length": 79.5, "epoch": 0.04925143416818245, "grad_norm": 1.7349156594295274, "kl": 0.23681640625, "learning_rate": 6.733687002931141e-07, "loss": -0.0181, "num_tokens": 12327448.0, "reward": 0.3754356447607279, "reward_std": 0.11030078027397394, "rewards/code_reward/mean": 0.3754356447607279, "rewards/code_reward/std": 0.11030078679323196, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.5, "completions/max_terminated_length": 367.5, "completions/mean_length": 209.40625, "completions/mean_terminated_length": 209.40625, "completions/min_length": 101.75, "completions/min_terminated_length": 101.75, "epoch": 0.04936336924583742, "grad_norm": 1.4076525318028708, "kl": 0.3115234375, "learning_rate": 6.678022333039158e-07, "loss": -0.0373, "num_tokens": 12347901.0, "reward": 0.09711253456771374, "reward_std": 0.057268128264695406, "rewards/code_reward/mean": 0.09711253456771374, "rewards/code_reward/std": 0.05726812733337283, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.25, "completions/max_terminated_length": 265.25, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 74.75, "completions/min_terminated_length": 74.75, "epoch": 0.049475304323492376, "grad_norm": 1.4640386964818788, "kl": 0.293701171875, "learning_rate": 6.623231311627876e-07, "loss": 0.0452, "num_tokens": 12372045.0, "reward": 0.4493050128221512, "reward_std": 0.08245376159902662, "rewards/code_reward/mean": 0.4493050128221512, "rewards/code_reward/std": 0.08245376858394593, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.25, "completions/max_terminated_length": 381.25, "completions/mean_length": 170.96875, "completions/mean_terminated_length": 170.96875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.049587239401147336, "grad_norm": 2.1086386312820666, "kl": 0.27001953125, "learning_rate": 6.569316237618811e-07, "loss": 0.1217, "num_tokens": 12396860.0, "reward": 0.34658948611468077, "reward_std": 0.23270382836926728, "rewards/code_reward/mean": 0.34658948611468077, "rewards/code_reward/std": 0.23270384327042848, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.75, "completions/max_terminated_length": 419.75, "completions/mean_length": 220.5625, "completions/mean_terminated_length": 220.5625, "completions/min_length": 120.25, "completions/min_terminated_length": 120.25, "epoch": 0.049699174478802295, "grad_norm": 1.4630455489204783, "kl": 0.26171875, "learning_rate": 6.516279373180499e-07, "loss": 0.2184, "num_tokens": 12422750.0, "reward": 0.37283046543598175, "reward_std": 0.12992971763014793, "rewards/code_reward/mean": 0.37283046543598175, "rewards/code_reward/std": 0.12992972321808338, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.25, "completions/max_terminated_length": 268.25, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 72.75, "completions/min_terminated_length": 72.75, "epoch": 0.049811109556457255, "grad_norm": 1.6080266837223955, "kl": 0.27734375, "learning_rate": 6.464122943633543e-07, "loss": -0.0419, "num_tokens": 12441410.0, "reward": 0.14340316224843264, "reward_std": 0.15656377002596855, "rewards/code_reward/mean": 0.14340316224843264, "rewards/code_reward/std": 0.15656376257538795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 222.3125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.049923044634112214, "grad_norm": 1.2494209477442706, "kl": 0.2666015625, "learning_rate": 6.412849137357271e-07, "loss": -0.0008, "num_tokens": 12469060.0, "reward": 0.41476833214983344, "reward_std": 0.1326767287682742, "rewards/code_reward/mean": 0.41476833214983344, "rewards/code_reward/std": 0.1326767250429839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.25, "completions/max_terminated_length": 367.25, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.05003497971176717, "grad_norm": 1.5486642122578553, "kl": 0.235107421875, "learning_rate": 6.3624601056979e-07, "loss": 0.1428, "num_tokens": 12493716.0, "reward": 0.5466772168874741, "reward_std": 0.3743356466293335, "rewards/code_reward/mean": 0.5466772168874741, "rewards/code_reward/std": 0.37433566339313984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.5, "completions/max_terminated_length": 450.5, "completions/mean_length": 191.28125, "completions/mean_terminated_length": 191.28125, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.05014691478942213, "grad_norm": 1.3722991110183906, "kl": 0.2452392578125, "learning_rate": 6.312957962878278e-07, "loss": 0.2083, "num_tokens": 12519901.0, "reward": 0.4153126999735832, "reward_std": 0.04972913861274719, "rewards/code_reward/mean": 0.4153126999735832, "rewards/code_reward/std": 0.04972913861274719, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.25, "completions/max_terminated_length": 242.25, "completions/mean_length": 156.4375, "completions/mean_terminated_length": 156.4375, "completions/min_length": 76.75, "completions/min_terminated_length": 76.75, "epoch": 0.05025884986707709, "grad_norm": 1.6733807611752833, "kl": 0.339599609375, "learning_rate": 6.264344785909181e-07, "loss": 0.0653, "num_tokens": 12537763.0, "reward": 0.27301738993264735, "reward_std": 0.14166639209724963, "rewards/code_reward/mean": 0.27301738993264735, "rewards/code_reward/std": 0.14166639978066087, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 685.0, "completions/max_terminated_length": 364.25, "completions/mean_length": 219.84375, "completions/mean_terminated_length": 163.70536041259766, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.05037078494473206, "grad_norm": 1.3730882038714862, "kl": 0.3033447265625, "learning_rate": 6.216622614502149e-07, "loss": 0.2151, "num_tokens": 12564502.0, "reward": 0.27368341060355306, "reward_std": 0.12103560357354581, "rewards/code_reward/mean": 0.27368341060355306, "rewards/code_reward/std": 0.1210356056690216, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.05048272002238702, "grad_norm": 1.1120930858197564, "kl": 0.256103515625, "learning_rate": 6.169793450983916e-07, "loss": 0.0663, "num_tokens": 12595766.0, "reward": 0.2891203761100769, "reward_std": 0.005760519183240831, "rewards/code_reward/mean": 0.2891203761100769, "rewards/code_reward/std": 0.005760519299656153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.75, "completions/max_terminated_length": 274.75, "completions/mean_length": 180.71875, "completions/mean_terminated_length": 180.71875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.05059465510004198, "grad_norm": 1.5436514492166131, "kl": 0.3642578125, "learning_rate": 6.123859260212393e-07, "loss": 0.07, "num_tokens": 12617805.0, "reward": 0.3369871713221073, "reward_std": 0.1278561158105731, "rewards/code_reward/mean": 0.3369871713221073, "rewards/code_reward/std": 0.12785612046718597, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.5, "completions/max_terminated_length": 417.5, "completions/mean_length": 189.8125, "completions/mean_terminated_length": 189.8125, "completions/min_length": 80.25, "completions/min_terminated_length": 80.25, "epoch": 0.050706590177696936, "grad_norm": 1.655439639393973, "kl": 0.302978515625, "learning_rate": 6.07882196949423e-07, "loss": -0.0797, "num_tokens": 12641655.0, "reward": 0.19023456424474716, "reward_std": 0.11184495687484741, "rewards/code_reward/mean": 0.19023456424474716, "rewards/code_reward/std": 0.11184496060013771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 267.09375, "completions/mean_terminated_length": 267.09375, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "epoch": 0.050818525255351896, "grad_norm": 1.0464296647386528, "kl": 0.1806640625, "learning_rate": 6.034683468503948e-07, "loss": -0.0225, "num_tokens": 12663874.0, "reward": 0.3470753263682127, "reward_std": 0.1855549574829638, "rewards/code_reward/mean": 0.3470753263682127, "rewards/code_reward/std": 0.1855549574829638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 245.15625, "completions/mean_terminated_length": 245.15625, "completions/min_length": 89.75, "completions/min_terminated_length": 89.75, "epoch": 0.050930460333006855, "grad_norm": 1.3181084417709616, "kl": 0.22265625, "learning_rate": 5.991445609204641e-07, "loss": -0.2186, "num_tokens": 12690983.0, "reward": 0.3848821893334389, "reward_std": 0.1244323942810297, "rewards/code_reward/mean": 0.3848821893334389, "rewards/code_reward/std": 0.12443239195272326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.5, "completions/max_terminated_length": 298.5, "completions/mean_length": 179.1875, "completions/mean_terminated_length": 179.1875, "completions/min_length": 97.5, "completions/min_terminated_length": 97.5, "epoch": 0.051042395410661814, "grad_norm": 1.1842353804957628, "kl": 0.24755859375, "learning_rate": 5.949110205770292e-07, "loss": -0.0126, "num_tokens": 12714285.0, "reward": 0.23306879866868258, "reward_std": 0.058227866189554334, "rewards/code_reward/mean": 0.23306879866868258, "rewards/code_reward/std": 0.05822786991484463, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.75, "completions/max_terminated_length": 424.75, "completions/mean_length": 212.28125, "completions/mean_terminated_length": 212.28125, "completions/min_length": 89.25, "completions/min_terminated_length": 89.25, "epoch": 0.051154330488316774, "grad_norm": 1.361690839229129, "kl": 0.21875, "learning_rate": 5.90767903450964e-07, "loss": 0.0986, "num_tokens": 12738542.0, "reward": 0.07210950274020433, "reward_std": 0.05199644831009209, "rewards/code_reward/mean": 0.07210950274020433, "rewards/code_reward/std": 0.05199644842650741, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.75, "completions/max_terminated_length": 288.75, "completions/mean_length": 172.09375, "completions/mean_terminated_length": 172.09375, "completions/min_length": 76.25, "completions/min_terminated_length": 76.25, "epoch": 0.05126626556597173, "grad_norm": 1.627223053243688, "kl": 0.25048828125, "learning_rate": 5.867153833791652e-07, "loss": -0.0443, "num_tokens": 12761809.0, "reward": 0.2288264101371169, "reward_std": 0.18154687527567148, "rewards/code_reward/mean": 0.2288264101371169, "rewards/code_reward/std": 0.18154688365757465, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 853.75, "completions/max_terminated_length": 403.0, "completions/mean_length": 259.6875, "completions/mean_terminated_length": 201.59375381469727, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.0513782006436267, "grad_norm": 1.37901004815236, "kl": 0.231201171875, "learning_rate": 5.827536303972587e-07, "loss": 0.3202, "num_tokens": 12797623.0, "reward": 0.4052652306854725, "reward_std": 0.09384694416075945, "rewards/code_reward/mean": 0.4052652306854725, "rewards/code_reward/std": 0.09384695184417069, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.5, "completions/max_terminated_length": 467.5, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 88.25, "completions/min_terminated_length": 88.25, "epoch": 0.05149013572128166, "grad_norm": 1.1377121561938153, "kl": 0.241943359375, "learning_rate": 5.78882810732465e-07, "loss": -0.0275, "num_tokens": 12819217.0, "reward": 0.45869156159460545, "reward_std": 0.021421764977276325, "rewards/code_reward/mean": 0.45869156159460545, "rewards/code_reward/std": 0.021421766839921474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.25, "completions/max_terminated_length": 263.25, "completions/mean_length": 155.9375, "completions/mean_terminated_length": 155.9375, "completions/min_length": 84.5, "completions/min_terminated_length": 84.5, "epoch": 0.05160207079893662, "grad_norm": 1.1807451529495485, "kl": 0.306884765625, "learning_rate": 5.75103086796625e-07, "loss": -0.0194, "num_tokens": 12843095.0, "reward": 0.018822902347892523, "reward_std": 0.015156067907810211, "rewards/code_reward/mean": 0.018822902347892523, "rewards/code_reward/std": 0.015156067907810211, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 258.4375, "completions/mean_terminated_length": 258.4375, "completions/min_length": 161.25, "completions/min_terminated_length": 161.25, "epoch": 0.05171400587659158, "grad_norm": 1.336417383594584, "kl": 0.199951171875, "learning_rate": 5.714146171793846e-07, "loss": 0.1694, "num_tokens": 12866173.0, "reward": 0.13850605115294456, "reward_std": 0.0918186865746975, "rewards/code_reward/mean": 0.13850605115294456, "rewards/code_reward/std": 0.09181869029998779, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.5, "completions/max_terminated_length": 452.5, "completions/mean_length": 190.40625, "completions/mean_terminated_length": 190.40625, "completions/min_length": 83.75, "completions/min_terminated_length": 83.75, "epoch": 0.05182594095424654, "grad_norm": 1.7230754097104986, "kl": 0.298583984375, "learning_rate": 5.678175566415422e-07, "loss": 0.0863, "num_tokens": 12892290.0, "reward": 0.33602308854460716, "reward_std": 0.07540364377200603, "rewards/code_reward/mean": 0.33602308854460716, "rewards/code_reward/std": 0.07540364749729633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.75, "completions/max_terminated_length": 373.75, "completions/mean_length": 159.21875, "completions/mean_terminated_length": 159.21875, "completions/min_length": 69.25, "completions/min_terminated_length": 69.25, "epoch": 0.051937876031901496, "grad_norm": 1.5832482382803177, "kl": 0.1796875, "learning_rate": 5.643120561085528e-07, "loss": -0.0099, "num_tokens": 12911025.0, "reward": 0.5693264603614807, "reward_std": 0.09451888594776392, "rewards/code_reward/mean": 0.5693264603614807, "rewards/code_reward/std": 0.09451888781040907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 771.25, "completions/max_terminated_length": 698.25, "completions/mean_length": 317.71875, "completions/mean_terminated_length": 270.93304443359375, "completions/min_length": 121.25, "completions/min_terminated_length": 121.25, "epoch": 0.052049811109556456, "grad_norm": 0.5755700855439375, "kl": 0.2149658203125, "learning_rate": 5.608982626641991e-07, "loss": 0.033, "num_tokens": 12946576.0, "reward": 0.3332868255674839, "reward_std": 0.08039725571870804, "rewards/code_reward/mean": 0.3332868255674839, "rewards/code_reward/std": 0.08039725571870804, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.25, "completions/max_terminated_length": 229.25, "completions/mean_length": 158.65625, "completions/mean_terminated_length": 158.65625, "completions/min_length": 84.25, "completions/min_terminated_length": 84.25, "epoch": 0.052161746187211415, "grad_norm": 1.4999164462281511, "kl": 0.25244140625, "learning_rate": 5.575763195444166e-07, "loss": 0.1101, "num_tokens": 12962893.0, "reward": 0.26572345197200775, "reward_std": 0.08325213519856334, "rewards/code_reward/mean": 0.26572345197200775, "rewards/code_reward/std": 0.08325213845819235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.75, "completions/max_terminated_length": 256.75, "completions/mean_length": 156.65625, "completions/mean_terminated_length": 156.65625, "completions/min_length": 84.25, "completions/min_terminated_length": 84.25, "epoch": 0.052273681264866374, "grad_norm": 1.5654218625865297, "kl": 0.29931640625, "learning_rate": 5.543463661312847e-07, "loss": 0.0124, "num_tokens": 12989394.0, "reward": 0.4510860964655876, "reward_std": 0.038132989313453436, "rewards/code_reward/mean": 0.4510860964655876, "rewards/code_reward/std": 0.03813299024477601, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.75, "completions/max_terminated_length": 284.75, "completions/mean_length": 189.0625, "completions/mean_terminated_length": 189.0625, "completions/min_length": 110.25, "completions/min_terminated_length": 110.25, "epoch": 0.05238561634252134, "grad_norm": 1.1317457646109947, "kl": 0.223388671875, "learning_rate": 5.512085379471808e-07, "loss": -0.0249, "num_tokens": 13014908.0, "reward": 0.3786849081516266, "reward_std": 0.1657154718413949, "rewards/code_reward/mean": 0.3786849081516266, "rewards/code_reward/std": 0.16571548115462065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.25, "completions/max_terminated_length": 308.25, "completions/mean_length": 182.21875, "completions/mean_terminated_length": 182.21875, "completions/min_length": 90.75, "completions/min_terminated_length": 90.75, "epoch": 0.0524975514201763, "grad_norm": 1.3580873503854825, "kl": 0.2578125, "learning_rate": 5.481629666490903e-07, "loss": 0.041, "num_tokens": 13033779.0, "reward": 0.6068142354488373, "reward_std": 0.21611789241433144, "rewards/code_reward/mean": 0.6068142354488373, "rewards/code_reward/std": 0.21611790172755718, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.75, "completions/max_terminated_length": 379.75, "completions/mean_length": 216.59375, "completions/mean_terminated_length": 216.59375, "completions/min_length": 120.5, "completions/min_terminated_length": 120.5, "epoch": 0.05260948649783126, "grad_norm": 1.2530077335631926, "kl": 0.28271484375, "learning_rate": 5.452097800230853e-07, "loss": 0.0203, "num_tokens": 13058070.0, "reward": 0.4219468259252608, "reward_std": 0.08188007143326104, "rewards/code_reward/mean": 0.4219468259252608, "rewards/code_reward/std": 0.08188007143326104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.25, "completions/max_terminated_length": 373.25, "completions/mean_length": 230.09375, "completions/mean_terminated_length": 230.09375, "completions/min_length": 153.25, "completions/min_terminated_length": 153.25, "epoch": 0.05272142157548622, "grad_norm": 1.8321679081555318, "kl": 0.239013671875, "learning_rate": 5.423491019789623e-07, "loss": -0.1368, "num_tokens": 13082769.0, "reward": 0.30861951038241386, "reward_std": 0.16406975965946913, "rewards/code_reward/mean": 0.30861951038241386, "rewards/code_reward/std": 0.16406976664438844, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.75, "completions/max_terminated_length": 404.75, "completions/mean_length": 182.65625, "completions/mean_terminated_length": 182.65625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.05283335665314118, "grad_norm": 0.9729476481805254, "kl": 0.255126953125, "learning_rate": 5.395810525450425e-07, "loss": 0.0919, "num_tokens": 13106534.0, "reward": 0.21364107308909297, "reward_std": 0.09037529258057475, "rewards/code_reward/mean": 0.21364107308909297, "rewards/code_reward/std": 0.09037529304623604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.5, "completions/max_terminated_length": 315.5, "completions/mean_length": 170.96875, "completions/mean_terminated_length": 170.96875, "completions/min_length": 92.75, "completions/min_terminated_length": 92.75, "epoch": 0.05294529173079614, "grad_norm": 0.9407242390344841, "kl": 0.236083984375, "learning_rate": 5.369057478631359e-07, "loss": 0.0076, "num_tokens": 13125717.0, "reward": 0.18790849673678167, "reward_std": 0.11648390302434564, "rewards/code_reward/mean": 0.18790849673678167, "rewards/code_reward/std": 0.11648390302434564, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 287.9375, "completions/mean_terminated_length": 287.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0530572268084511, "grad_norm": 0.6022039821926372, "kl": 0.2093505859375, "learning_rate": 5.343233001836694e-07, "loss": -0.0311, "num_tokens": 13152515.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/code_reward/mean": 0.46875, "rewards/code_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.75, "completions/max_terminated_length": 255.75, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.053169161886106056, "grad_norm": 1.3145736475742311, "kl": 0.292236328125, "learning_rate": 5.318338178609754e-07, "loss": -0.0802, "num_tokens": 13178797.0, "reward": 0.07483806018717587, "reward_std": 0.034171308507211506, "rewards/code_reward/mean": 0.07483806018717587, "rewards/code_reward/std": 0.03417130699381232, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.25, "completions/max_terminated_length": 396.25, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.053281096963761015, "grad_norm": 1.0884074198105211, "kl": 0.156005859375, "learning_rate": 5.294374053487459e-07, "loss": 0.0987, "num_tokens": 13203897.0, "reward": 0.15914655849337578, "reward_std": 0.08056560717523098, "rewards/code_reward/mean": 0.15914655849337578, "rewards/code_reward/std": 0.0805656099691987, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.75, "completions/max_terminated_length": 263.75, "completions/mean_length": 168.9375, "completions/mean_terminated_length": 168.9375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.05339303204141598, "grad_norm": 0.9450638032428148, "kl": 0.3330078125, "learning_rate": 5.271341631956511e-07, "loss": -0.038, "num_tokens": 13233671.0, "reward": 0.5676594115793705, "reward_std": 0.0675080195069313, "rewards/code_reward/mean": 0.5676594115793705, "rewards/code_reward/std": 0.0675080232322216, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.75, "completions/max_terminated_length": 334.75, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 88.75, "completions/min_terminated_length": 88.75, "epoch": 0.05350496711907094, "grad_norm": 1.336348928731024, "kl": 0.270751953125, "learning_rate": 5.249241880411181e-07, "loss": -0.0158, "num_tokens": 13265057.0, "reward": 0.381644893437624, "reward_std": 0.09550960175693035, "rewards/code_reward/mean": 0.381644893437624, "rewards/code_reward/std": 0.09550959896296263, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.25, "completions/max_terminated_length": 299.25, "completions/mean_length": 193.1875, "completions/mean_terminated_length": 193.1875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0536169021967259, "grad_norm": 1.5049532201349622, "kl": 0.23974609375, "learning_rate": 5.228075726112785e-07, "loss": 0.0894, "num_tokens": 13285927.0, "reward": 0.16660759504884481, "reward_std": 0.04511617706157267, "rewards/code_reward/mean": 0.16660759504884481, "rewards/code_reward/std": 0.04511618078686297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.5, "completions/max_terminated_length": 325.5, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 116.75, "completions/min_terminated_length": 116.75, "epoch": 0.05372883727438086, "grad_norm": 1.3864869015820263, "kl": 0.21337890625, "learning_rate": 5.207844057150768e-07, "loss": 0.158, "num_tokens": 13303407.0, "reward": 0.6197916716337204, "reward_std": 0.20998739823698997, "rewards/code_reward/mean": 0.6197916716337204, "rewards/code_reward/std": 0.20998739078640938, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.5, "completions/max_terminated_length": 265.5, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 92.75, "completions/min_terminated_length": 92.75, "epoch": 0.05384077235203582, "grad_norm": 1.7579933265437377, "kl": 0.256591796875, "learning_rate": 5.188547722405437e-07, "loss": 0.0498, "num_tokens": 13323183.0, "reward": 0.30053258687257767, "reward_std": 0.14006465952843428, "rewards/code_reward/mean": 0.30053258687257767, "rewards/code_reward/std": 0.1400646585971117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.75, "completions/max_terminated_length": 254.75, "completions/mean_length": 146.4375, "completions/mean_terminated_length": 146.4375, "completions/min_length": 70.25, "completions/min_terminated_length": 70.25, "epoch": 0.05395270742969078, "grad_norm": 1.4622693544793195, "kl": 0.31591796875, "learning_rate": 5.170187531512351e-07, "loss": 0.0813, "num_tokens": 13348197.0, "reward": 0.2889851483050734, "reward_std": 0.04471541300881654, "rewards/code_reward/mean": 0.2889851483050734, "rewards/code_reward/std": 0.044715409399941564, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.5, "completions/max_terminated_length": 292.5, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 100.75, "completions/min_terminated_length": 100.75, "epoch": 0.05406464250734574, "grad_norm": 1.1459398821291238, "kl": 0.280029296875, "learning_rate": 5.152764254828348e-07, "loss": 0.1023, "num_tokens": 13372969.0, "reward": 0.5509072579443455, "reward_std": 0.1530819907784462, "rewards/code_reward/mean": 0.5509072579443455, "rewards/code_reward/std": 0.1530819982290268, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.25, "completions/max_terminated_length": 220.25, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 80.5, "completions/min_terminated_length": 80.5, "epoch": 0.0541765775850007, "grad_norm": 1.5763809825687893, "kl": 0.2509765625, "learning_rate": 5.136278623399225e-07, "loss": -0.0076, "num_tokens": 13397611.0, "reward": 0.43999266996979713, "reward_std": 0.15885511133819818, "rewards/code_reward/mean": 0.43999266996979713, "rewards/code_reward/std": 0.15885510575026274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.25, "completions/max_terminated_length": 348.25, "completions/mean_length": 190.71875, "completions/mean_terminated_length": 190.71875, "completions/min_length": 70.25, "completions/min_terminated_length": 70.25, "epoch": 0.05428851266265566, "grad_norm": 1.543893067503445, "kl": 0.22705078125, "learning_rate": 5.120731328929058e-07, "loss": 0.1822, "num_tokens": 13421994.0, "reward": 0.4371974468231201, "reward_std": 0.06127816252410412, "rewards/code_reward/mean": 0.4371974468231201, "rewards/code_reward/std": 0.06127816252410412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 158.1875, "completions/mean_terminated_length": 158.1875, "completions/min_length": 79.25, "completions/min_terminated_length": 79.25, "epoch": 0.05440044774031062, "grad_norm": 2.287656708183697, "kl": 0.248046875, "learning_rate": 5.106123023751187e-07, "loss": 0.1385, "num_tokens": 13446792.0, "reward": 0.37210020469501615, "reward_std": 0.13278006156906486, "rewards/code_reward/mean": 0.37210020469501615, "rewards/code_reward/std": 0.132780060172081, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.75, "completions/max_terminated_length": 296.75, "completions/mean_length": 184.6875, "completions/mean_terminated_length": 184.6875, "completions/min_length": 116.75, "completions/min_terminated_length": 116.75, "epoch": 0.05451238281796558, "grad_norm": 1.5495749732421749, "kl": 0.2958984375, "learning_rate": 5.092454320800833e-07, "loss": 0.0935, "num_tokens": 13472366.0, "reward": 0.29983099177479744, "reward_std": 0.1254198516253382, "rewards/code_reward/mean": 0.29983099177479744, "rewards/code_reward/std": 0.1254198516253382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 726.5, "completions/max_terminated_length": 252.75, "completions/mean_length": 214.78125, "completions/mean_terminated_length": 153.8258934020996, "completions/min_length": 74.75, "completions/min_terminated_length": 74.75, "epoch": 0.05462431789562054, "grad_norm": 1.1195840237499781, "kl": 0.3201904296875, "learning_rate": 5.079725793589405e-07, "loss": 0.0209, "num_tokens": 13498479.0, "reward": 0.5496091386303306, "reward_std": 0.08358209393918514, "rewards/code_reward/mean": 0.5496091386303306, "rewards/code_reward/std": 0.08358209580183029, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.5, "completions/max_terminated_length": 250.5, "completions/mean_length": 134.15625, "completions/mean_terminated_length": 134.15625, "completions/min_length": 74.75, "completions/min_terminated_length": 74.75, "epoch": 0.0547362529732755, "grad_norm": 1.6494391124504628, "kl": 0.277099609375, "learning_rate": 5.067937976180407e-07, "loss": 0.158, "num_tokens": 13520068.0, "reward": 0.4070262387394905, "reward_std": 0.1368686156347394, "rewards/code_reward/mean": 0.4070262387394905, "rewards/code_reward/std": 0.13686862308532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 142.4375, "completions/mean_terminated_length": 142.4375, "completions/min_length": 90.75, "completions/min_terminated_length": 90.75, "epoch": 0.05484818805093046, "grad_norm": 1.8021016908957268, "kl": 0.306640625, "learning_rate": 5.057091363167046e-07, "loss": -0.0293, "num_tokens": 13540754.0, "reward": 0.2056608572602272, "reward_std": 0.09596531838178635, "rewards/code_reward/mean": 0.2056608572602272, "rewards/code_reward/std": 0.09596531558781862, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.75, "completions/max_terminated_length": 342.75, "completions/mean_length": 205.40625, "completions/mean_terminated_length": 205.40625, "completions/min_length": 105.25, "completions/min_terminated_length": 105.25, "epoch": 0.05496012312858542, "grad_norm": 1.1900185406401222, "kl": 0.287109375, "learning_rate": 5.047186409651489e-07, "loss": -0.002, "num_tokens": 13564991.0, "reward": 0.4886061754077673, "reward_std": 0.2226488906890154, "rewards/code_reward/mean": 0.4886061754077673, "rewards/code_reward/std": 0.22264889813959599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.5, "completions/max_terminated_length": 295.5, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 101.75, "completions/min_terminated_length": 101.75, "epoch": 0.05507205820624038, "grad_norm": 1.2327221390909047, "kl": 0.21728515625, "learning_rate": 5.038223531225742e-07, "loss": -0.0388, "num_tokens": 13586065.0, "reward": 0.5599798411130905, "reward_std": 0.1288916040211916, "rewards/code_reward/mean": 0.5599798411130905, "rewards/code_reward/std": 0.12889160588383675, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.5, "completions/max_terminated_length": 265.5, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 77.25, "completions/min_terminated_length": 77.25, "epoch": 0.05518399328389534, "grad_norm": 1.7391260110979152, "kl": 0.26513671875, "learning_rate": 5.030203103954232e-07, "loss": -0.1875, "num_tokens": 13605021.0, "reward": 0.431189201772213, "reward_std": 0.26025911793112755, "rewards/code_reward/mean": 0.431189201772213, "rewards/code_reward/std": 0.2602591188624501, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.25, "completions/max_terminated_length": 289.25, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0552959283615503, "grad_norm": 1.4124131086208604, "kl": 0.2646484375, "learning_rate": 5.023125464358026e-07, "loss": 0.0666, "num_tokens": 13624443.0, "reward": 0.33201567456126213, "reward_std": 0.02758226078003645, "rewards/code_reward/mean": 0.33201567456126213, "rewards/code_reward/std": 0.027582260314375162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.25, "completions/max_terminated_length": 228.25, "completions/mean_length": 148.9375, "completions/mean_terminated_length": 148.9375, "completions/min_length": 84.5, "completions/min_terminated_length": 84.5, "epoch": 0.055407863439205264, "grad_norm": 1.6145368535030744, "kl": 0.35107421875, "learning_rate": 5.016990909400709e-07, "loss": -0.0059, "num_tokens": 13651457.0, "reward": 0.326155461370945, "reward_std": 0.1283545382320881, "rewards/code_reward/mean": 0.326155461370945, "rewards/code_reward/std": 0.1283545382320881, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.5, "completions/max_terminated_length": 320.5, "completions/mean_length": 155.40625, "completions/mean_terminated_length": 155.40625, "completions/min_length": 78.75, "completions/min_terminated_length": 78.75, "epoch": 0.05551979851686022, "grad_norm": 1.6838493896543587, "kl": 0.251220703125, "learning_rate": 5.011799696475915e-07, "loss": 0.0376, "num_tokens": 13676038.0, "reward": 0.4704548120498657, "reward_std": 0.23622475564479828, "rewards/code_reward/mean": 0.4704548120498657, "rewards/code_reward/std": 0.23622475564479828, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 189.15625, "completions/mean_terminated_length": 189.15625, "completions/min_length": 118.75, "completions/min_terminated_length": 118.75, "epoch": 0.05563173359451518, "grad_norm": 1.7113766777536588, "kl": 0.2880859375, "learning_rate": 5.007552043396547e-07, "loss": -0.0331, "num_tokens": 13705947.0, "reward": 0.404338245280087, "reward_std": 0.2840197389014065, "rewards/code_reward/mean": 0.404338245280087, "rewards/code_reward/std": 0.2840197426266968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.75, "completions/max_terminated_length": 386.75, "completions/mean_length": 227.40625, "completions/mean_terminated_length": 227.40625, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.05574366867217014, "grad_norm": 1.3321241616316153, "kl": 0.192626953125, "learning_rate": 5.004248128385618e-07, "loss": 0.1036, "num_tokens": 13729656.0, "reward": 0.19941096380352974, "reward_std": 0.1264824215322733, "rewards/code_reward/mean": 0.19941096380352974, "rewards/code_reward/std": 0.1264824327081442, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 73.5, "completions/min_terminated_length": 73.5, "epoch": 0.0558556037498251, "grad_norm": 1.4751264574193441, "kl": 0.318359375, "learning_rate": 5.001888090068784e-07, "loss": -0.0364, "num_tokens": 13749472.0, "reward": 0.44989876449108124, "reward_std": 0.047168461605906487, "rewards/code_reward/mean": 0.44989876449108124, "rewards/code_reward/std": 0.04716846067458391, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 119.34375, "completions/mean_terminated_length": 119.34375, "completions/min_length": 56.75, "completions/min_terminated_length": 56.75, "epoch": 0.05596753882748006, "grad_norm": 0.48660836341370794, "kl": 0.421875, "learning_rate": 5.000472027468528e-07, "loss": 0.0205, "num_tokens": 13768043.0, "reward": 0.5847536753863096, "reward_std": 0.04997873678803444, "rewards/code_reward/mean": 0.5847536753863096, "rewards/code_reward/std": 0.04997873678803444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 500 }, { "epoch": 0.05596753882748006, "step": 500, "total_flos": 0.0, "train_loss": 0.02433196935596061, "train_runtime": 50427.8575, "train_samples_per_second": 0.317, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 13768043, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }