{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.992, "eval_steps": 500, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 140.9583396911621, "epoch": 0.016, "grad_norm": 1.4258953228320013, "kl": 0.0, "learning_rate": 1.25e-07, "loss": 0.0, "reward": 0.5152640044689178, "reward_std": 0.5508254170417786, "rewards/correct_code_reward_func": 0.2291666716337204, "rewards/len_reward_func": 0.28609737753868103, "step": 1 }, { "completion_length": 131.50000762939453, "epoch": 0.032, "grad_norm": 1.1519258687122351, "kl": 0.0, "learning_rate": 2.5e-07, "loss": 0.0, "reward": 0.541226252913475, "reward_std": 0.5189632624387741, "rewards/correct_code_reward_func": 0.2500000111758709, "rewards/len_reward_func": 0.29122625291347504, "step": 2 }, { "completion_length": 108.83333587646484, "epoch": 0.048, "grad_norm": 1.467116667329371, "kl": 0.00013637542724609375, "learning_rate": 3.75e-07, "loss": 0.0, "reward": 0.7587994039058685, "reward_std": 0.5140225142240524, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.21713273972272873, "step": 3 }, { "completion_length": 159.81250762939453, "epoch": 0.064, "grad_norm": 1.2682485669137435, "kl": 0.00018215179443359375, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.5178248882293701, "reward_std": 0.4526914358139038, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.3511582016944885, "step": 4 }, { "completion_length": 176.56250762939453, "epoch": 0.08, "grad_norm": 1.2033440109589688, "kl": 0.00014066696166992188, "learning_rate": 4.999157413258781e-07, "loss": 0.0, "reward": 0.32241350412368774, "reward_std": 0.32281263172626495, "rewards/correct_code_reward_func": 0.02083333395421505, "rewards/len_reward_func": 0.30158019065856934, "step": 5 }, { "completion_length": 124.87500762939453, "epoch": 0.096, "grad_norm": 1.5120707071506325, "kl": 0.00016808509826660156, "learning_rate": 4.996630220997057e-07, "loss": 0.0, "reward": 0.746085911989212, "reward_std": 0.5452268123626709, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.28775252401828766, "step": 6 }, { "completion_length": 169.9166717529297, "epoch": 0.112, "grad_norm": 0.9079518632617903, "kl": 0.00011348724365234375, "learning_rate": 4.992420126717784e-07, "loss": 0.0, "reward": 0.36989694088697433, "reward_std": 0.45903605222702026, "rewards/correct_code_reward_func": 0.125, "rewards/len_reward_func": 0.24489693343639374, "step": 7 }, { "completion_length": 219.43750762939453, "epoch": 0.128, "grad_norm": 1.2633142753352289, "kl": 0.0002155303955078125, "learning_rate": 4.986529968316653e-07, "loss": 0.0, "reward": 0.44794920086860657, "reward_std": 0.385338693857193, "rewards/correct_code_reward_func": 0.1250000037252903, "rewards/len_reward_func": 0.3229491859674454, "step": 8 }, { "completion_length": 227.91667938232422, "epoch": 0.144, "grad_norm": 1.0211344567101885, "kl": 0.00011777877807617188, "learning_rate": 4.978963716169165e-07, "loss": 0.0, "reward": 0.6235890090465546, "reward_std": 0.5187947303056717, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.31108900904655457, "step": 9 }, { "completion_length": 188.25000762939453, "epoch": 0.16, "grad_norm": 1.0353822839723037, "kl": 0.00011730194091796875, "learning_rate": 4.969726470454313e-07, "loss": 0.0, "reward": 0.6911160051822662, "reward_std": 0.5456923246383667, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.27444930374622345, "step": 10 }, { "completion_length": 168.27083587646484, "epoch": 0.176, "grad_norm": 1.7856755608823207, "kl": 0.00018310546875, "learning_rate": 4.958824457716706e-07, "loss": 0.0, "reward": 0.4588584154844284, "reward_std": 0.40716809034347534, "rewards/correct_code_reward_func": 0.1875, "rewards/len_reward_func": 0.271358385682106, "step": 11 }, { "completion_length": 203.08333587646484, "epoch": 0.192, "grad_norm": 0.9296992149271633, "kl": 0.00016641616821289062, "learning_rate": 4.946265026669454e-07, "loss": 0.0, "reward": 0.3501324951648712, "reward_std": 0.49003708362579346, "rewards/correct_code_reward_func": 0.1041666679084301, "rewards/len_reward_func": 0.245965838432312, "step": 12 }, { "completion_length": 115.66666793823242, "epoch": 0.208, "grad_norm": 1.4335533212366607, "kl": 0.00016570091247558594, "learning_rate": 4.932056643240618e-07, "loss": 0.0, "reward": 0.7853705883026123, "reward_std": 0.46111349761486053, "rewards/correct_code_reward_func": 0.5000000149011612, "rewards/len_reward_func": 0.2853705883026123, "step": 13 }, { "completion_length": 169.95833587646484, "epoch": 0.224, "grad_norm": 1.2723280538596287, "kl": 0.00021076202392578125, "learning_rate": 4.916208884866592e-07, "loss": 0.0, "reward": 0.5324039310216904, "reward_std": 0.5338821411132812, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.26157061755657196, "step": 14 }, { "completion_length": 154.58333587646484, "epoch": 0.24, "grad_norm": 1.2578666329332273, "kl": 0.00019168853759765625, "learning_rate": 4.898732434036243e-07, "loss": 0.0, "reward": 0.5949100255966187, "reward_std": 0.5048613250255585, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.28241002559661865, "step": 15 }, { "completion_length": 173.1875114440918, "epoch": 0.256, "grad_norm": 1.1230347862341579, "kl": 0.00029277801513671875, "learning_rate": 4.879639071090173e-07, "loss": 0.0, "reward": 0.4564344882965088, "reward_std": 0.4671656936407089, "rewards/correct_code_reward_func": 0.1666666679084301, "rewards/len_reward_func": 0.2897678166627884, "step": 16 }, { "completion_length": 169.375, "epoch": 0.272, "grad_norm": 1.3041956300758726, "kl": 0.0002574920654296875, "learning_rate": 4.858941666279955e-07, "loss": 0.0, "reward": 0.6347246468067169, "reward_std": 0.5289804339408875, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.2805579602718353, "step": 17 }, { "completion_length": 133.25000762939453, "epoch": 0.288, "grad_norm": 1.354822217310785, "kl": 0.0002689361572265625, "learning_rate": 4.836654171092682e-07, "loss": 0.0, "reward": 0.5779364109039307, "reward_std": 0.4782462567090988, "rewards/correct_code_reward_func": 0.2916666716337204, "rewards/len_reward_func": 0.2862697243690491, "step": 18 }, { "completion_length": 99.41667175292969, "epoch": 0.304, "grad_norm": 1.4087777232916079, "kl": 0.00031757354736328125, "learning_rate": 4.812791608846709e-07, "loss": 0.0, "reward": 0.5035808980464935, "reward_std": 0.46289560198783875, "rewards/correct_code_reward_func": 0.229166679084301, "rewards/len_reward_func": 0.27441420406103134, "step": 19 }, { "completion_length": 170.7291717529297, "epoch": 0.32, "grad_norm": 0.9923230664440412, "kl": 0.00028705596923828125, "learning_rate": 4.787370064564882e-07, "loss": 0.0, "reward": 0.5567075908184052, "reward_std": 0.44439028203487396, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.34837424755096436, "step": 20 }, { "completion_length": 124.72917175292969, "epoch": 0.336, "grad_norm": 1.2245791922735345, "kl": 0.00035572052001953125, "learning_rate": 4.7604066741321253e-07, "loss": 0.0, "reward": 0.8560027182102203, "reward_std": 0.6356588900089264, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.31433598697185516, "step": 21 }, { "completion_length": 123.64583969116211, "epoch": 0.352, "grad_norm": 1.2080469812565267, "kl": 0.00035858154296875, "learning_rate": 4.731919612744659e-07, "loss": 0.0, "reward": 0.7242447733879089, "reward_std": 0.4742405414581299, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.32841143012046814, "step": 22 }, { "completion_length": 146.2916717529297, "epoch": 0.368, "grad_norm": 1.2440640880474592, "kl": 0.00040721893310546875, "learning_rate": 4.7019280826586604e-07, "loss": 0.0, "reward": 0.5270938575267792, "reward_std": 0.4260385036468506, "rewards/correct_code_reward_func": 0.2291666679084301, "rewards/len_reward_func": 0.2979271858930588, "step": 23 }, { "completion_length": 141.9166717529297, "epoch": 0.384, "grad_norm": 1.455943571941334, "kl": 0.0006427764892578125, "learning_rate": 4.6704523002466094e-07, "loss": 0.0, "reward": 0.5917265266180038, "reward_std": 0.47722122073173523, "rewards/correct_code_reward_func": 0.3333333358168602, "rewards/len_reward_func": 0.25839313119649887, "step": 24 }, { "completion_length": 240.85417938232422, "epoch": 0.4, "grad_norm": 0.8411889507435418, "kl": 0.0003604888916015625, "learning_rate": 4.6375134823700503e-07, "loss": 0.0, "reward": 0.3353981524705887, "reward_std": 0.351834774017334, "rewards/correct_code_reward_func": 0.0833333358168602, "rewards/len_reward_func": 0.2520648390054703, "step": 25 }, { "completion_length": 97.31250381469727, "epoch": 0.416, "grad_norm": 1.374585753278975, "kl": 0.0008258819580078125, "learning_rate": 4.603133832077953e-07, "loss": 0.0, "reward": 0.6881800889968872, "reward_std": 0.5626422464847565, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.2506800442934036, "step": 26 }, { "completion_length": 131.08333587646484, "epoch": 0.432, "grad_norm": 1.5040369557196518, "kl": 0.0006847381591796875, "learning_rate": 4.5673365236403216e-07, "loss": 0.0, "reward": 0.6470239758491516, "reward_std": 0.39606642723083496, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.20952393114566803, "step": 27 }, { "completion_length": 198.06250762939453, "epoch": 0.448, "grad_norm": 1.1110007536297855, "kl": 0.00054168701171875, "learning_rate": 4.530145686927125e-07, "loss": 0.0, "reward": 0.5166794955730438, "reward_std": 0.504486620426178, "rewards/correct_code_reward_func": 0.2500000149011612, "rewards/len_reward_func": 0.2666794955730438, "step": 28 }, { "completion_length": 152.52083587646484, "epoch": 0.464, "grad_norm": 1.134262039216797, "kl": 0.00078582763671875, "learning_rate": 4.4915863911430897e-07, "loss": 0.0, "reward": 0.5144253522157669, "reward_std": 0.4733017832040787, "rewards/correct_code_reward_func": 0.1875000111758709, "rewards/len_reward_func": 0.3269253224134445, "step": 29 }, { "completion_length": 139.7916717529297, "epoch": 0.48, "grad_norm": 1.010573889887009, "kl": 0.0007152557373046875, "learning_rate": 4.45168462792932e-07, "loss": 0.0, "reward": 0.5882390439510345, "reward_std": 0.43310636281967163, "rewards/correct_code_reward_func": 0.2500000074505806, "rewards/len_reward_func": 0.33823904395103455, "step": 30 }, { "completion_length": 87.41666793823242, "epoch": 0.496, "grad_norm": 1.540244950569226, "kl": 0.0012340545654296875, "learning_rate": 4.4104672938431223e-07, "loss": 0.0, "reward": 0.7711681425571442, "reward_std": 0.4805651605129242, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.18783476203680038, "step": 31 }, { "completion_length": 101.43750381469727, "epoch": 0.512, "grad_norm": 2.3673085026520297, "kl": 0.0012607574462890625, "learning_rate": 4.367962172227866e-07, "loss": 0.0, "reward": 0.7279457449913025, "reward_std": 0.4627054035663605, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.2696124166250229, "step": 32 }, { "completion_length": 155.2291717529297, "epoch": 0.528, "grad_norm": 1.2624598609488873, "kl": 0.00139617919921875, "learning_rate": 4.324197914485075e-07, "loss": 0.0, "reward": 0.6401492655277252, "reward_std": 0.515736848115921, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.26514923572540283, "step": 33 }, { "completion_length": 252.91667938232422, "epoch": 0.544, "grad_norm": 1.043728438493038, "kl": 0.0008392333984375, "learning_rate": 4.2792040207614e-07, "loss": 0.0, "reward": 0.6339870393276215, "reward_std": 0.5688490867614746, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.30065372586250305, "step": 34 }, { "completion_length": 178.25, "epoch": 0.56, "grad_norm": 1.2442169258805433, "kl": 0.00205230712890625, "learning_rate": 4.2330108200634723e-07, "loss": 0.0, "reward": 0.43357332795858383, "reward_std": 0.3690243661403656, "rewards/correct_code_reward_func": 0.16666667722165585, "rewards/len_reward_func": 0.26690666377544403, "step": 35 }, { "completion_length": 150.1666717529297, "epoch": 0.576, "grad_norm": 1.0937981889230137, "kl": 0.0016021728515625, "learning_rate": 4.185649449814045e-07, "loss": 0.0, "reward": 0.8725252151489258, "reward_std": 0.5368492603302002, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.3308584541082382, "step": 36 }, { "completion_length": 74.41666793823242, "epoch": 0.592, "grad_norm": 1.4560552034278569, "kl": 0.0020904541015625, "learning_rate": 4.137151834863213e-07, "loss": 0.0, "reward": 0.7634576857089996, "reward_std": 0.5292592346668243, "rewards/correct_code_reward_func": 0.5416666716337204, "rewards/len_reward_func": 0.22179099917411804, "step": 37 }, { "completion_length": 111.77083587646484, "epoch": 0.608, "grad_norm": 1.6125607277054597, "kl": 0.002716064453125, "learning_rate": 4.087550665968846e-07, "loss": 0.0, "reward": 0.6047167330980301, "reward_std": 0.4415762424468994, "rewards/correct_code_reward_func": 0.2916666865348816, "rewards/len_reward_func": 0.3130500763654709, "step": 38 }, { "completion_length": 87.0625, "epoch": 0.624, "grad_norm": 2.0747921723056026, "kl": 0.0023193359375, "learning_rate": 4.036879377760752e-07, "loss": 0.0, "reward": 0.7261738479137421, "reward_std": 0.6433705389499664, "rewards/correct_code_reward_func": 0.520833358168602, "rewards/len_reward_func": 0.20534051209688187, "step": 39 }, { "completion_length": 128.0833396911621, "epoch": 0.64, "grad_norm": 1.352520841789316, "kl": 0.00229644775390625, "learning_rate": 3.9851721262034157e-07, "loss": 0.0, "reward": 0.49166351556777954, "reward_std": 0.4290030002593994, "rewards/correct_code_reward_func": 0.18750000558793545, "rewards/len_reward_func": 0.30416350066661835, "step": 40 }, { "completion_length": 117.33333587646484, "epoch": 0.656, "grad_norm": 1.5281074207353524, "kl": 0.003509521484375, "learning_rate": 3.932463765572505e-07, "loss": 0.0, "reward": 0.5800679922103882, "reward_std": 0.5416670143604279, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.2675679475069046, "step": 41 }, { "completion_length": 112.43750381469727, "epoch": 0.672, "grad_norm": 1.2084984435618142, "kl": 0.00252532958984375, "learning_rate": 3.8787898249606767e-07, "loss": 0.0, "reward": 0.42490366101264954, "reward_std": 0.46323399245738983, "rewards/correct_code_reward_func": 0.14583333395421505, "rewards/len_reward_func": 0.27907034754753113, "step": 42 }, { "completion_length": 56.85416793823242, "epoch": 0.688, "grad_norm": 1.8756323954488632, "kl": 0.00452423095703125, "learning_rate": 3.8241864843284964e-07, "loss": 0.0, "reward": 0.7274035811424255, "reward_std": 0.5209662765264511, "rewards/correct_code_reward_func": 0.5000000149011612, "rewards/len_reward_func": 0.22740358859300613, "step": 43 }, { "completion_length": 153.68750762939453, "epoch": 0.704, "grad_norm": 1.785627080388602, "kl": 0.0055084228515625, "learning_rate": 3.768690550116639e-07, "loss": 0.0, "reward": 0.49254634976387024, "reward_std": 0.4052678644657135, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.32587967813014984, "step": 44 }, { "completion_length": 170.1041717529297, "epoch": 0.72, "grad_norm": 1.2057879792669277, "kl": 0.0038299560546875, "learning_rate": 3.712339430435792e-07, "loss": 0.0, "reward": 0.5373264253139496, "reward_std": 0.4612013250589371, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.2664930745959282, "step": 45 }, { "completion_length": 122.79167175292969, "epoch": 0.736, "grad_norm": 1.23844328247912, "kl": 0.00384521484375, "learning_rate": 3.65517110985099e-07, "loss": 0.0, "reward": 0.6534424722194672, "reward_std": 0.5896010398864746, "rewards/correct_code_reward_func": 0.354166679084301, "rewards/len_reward_func": 0.29927581548690796, "step": 46 }, { "completion_length": 73.39583396911621, "epoch": 0.752, "grad_norm": 2.222315006145743, "kl": 0.0058135986328125, "learning_rate": 3.597224123777389e-07, "loss": 0.0, "reward": 0.7357015609741211, "reward_std": 0.5119403451681137, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.2773682177066803, "step": 47 }, { "completion_length": 75.54166793823242, "epoch": 0.768, "grad_norm": 1.9981519435567456, "kl": 0.0053863525390625, "learning_rate": 3.5385375325047163e-07, "loss": 0.0, "reward": 0.6428782939910889, "reward_std": 0.6202229559421539, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.24704494327306747, "step": 48 }, { "completion_length": 73.27083587646484, "epoch": 0.784, "grad_norm": 2.073070842958071, "kl": 0.00554656982421875, "learning_rate": 3.479150894867926e-07, "loss": 0.0, "reward": 0.8005061745643616, "reward_std": 0.5489170849323273, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.25883948802948, "step": 49 }, { "completion_length": 93.62500381469727, "epoch": 0.8, "grad_norm": 1.7280406240103203, "kl": 0.0070953369140625, "learning_rate": 3.4191042415818e-07, "loss": 0.0, "reward": 0.6382943987846375, "reward_std": 0.4014574736356735, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.26329439133405685, "step": 50 }, { "completion_length": 110.31250381469727, "epoch": 0.816, "grad_norm": 1.5732703630042588, "kl": 0.008453369140625, "learning_rate": 3.3584380482574717e-07, "loss": 0.0, "reward": 0.8389279842376709, "reward_std": 0.6495693922042847, "rewards/correct_code_reward_func": 0.5208333432674408, "rewards/len_reward_func": 0.31809471547603607, "step": 51 }, { "completion_length": 81.4375, "epoch": 0.832, "grad_norm": 1.3555162901411408, "kl": 0.0072479248046875, "learning_rate": 3.297193208119047e-07, "loss": 0.0, "reward": 0.7050519585609436, "reward_std": 0.522288054227829, "rewards/correct_code_reward_func": 0.4375000298023224, "rewards/len_reward_func": 0.2675519585609436, "step": 52 }, { "completion_length": 145.2291717529297, "epoch": 0.848, "grad_norm": 1.2256688073258564, "kl": 0.00726318359375, "learning_rate": 3.235411004438741e-07, "loss": 0.0, "reward": 0.6400169730186462, "reward_std": 0.5816708207130432, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.28585030883550644, "step": 53 }, { "completion_length": 120.20833587646484, "epoch": 0.864, "grad_norm": 1.8462631631415796, "kl": 0.0084991455078125, "learning_rate": 3.173133082709086e-07, "loss": 0.0, "reward": 0.643402487039566, "reward_std": 0.3417808264493942, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.31006917357444763, "step": 54 }, { "completion_length": 55.56250190734863, "epoch": 0.88, "grad_norm": 1.7370166581779802, "kl": 0.01177978515625, "learning_rate": 3.1104014225709784e-07, "loss": 0.0, "reward": 0.9137917459011078, "reward_std": 0.5003669559955597, "rewards/correct_code_reward_func": 0.583333358168602, "rewards/len_reward_func": 0.3304583728313446, "step": 55 }, { "completion_length": 189.25000762939453, "epoch": 0.896, "grad_norm": 1.2196760565152192, "kl": 0.0058441162109375, "learning_rate": 3.0472583095164873e-07, "loss": 0.0, "reward": 0.4673280417919159, "reward_std": 0.4577627182006836, "rewards/correct_code_reward_func": 0.1666666716337204, "rewards/len_reward_func": 0.3006613999605179, "step": 56 }, { "completion_length": 57.37500190734863, "epoch": 0.912, "grad_norm": 2.0919947468048976, "kl": 0.010162353515625, "learning_rate": 2.983746306385499e-07, "loss": 0.0, "reward": 0.6931174695491791, "reward_std": 0.5172313153743744, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.21395081281661987, "step": 57 }, { "completion_length": 86.00000190734863, "epoch": 0.928, "grad_norm": 1.5907089477428527, "kl": 0.0113677978515625, "learning_rate": 2.919908224675412e-07, "loss": 0.0, "reward": 0.5865814685821533, "reward_std": 0.5177368223667145, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.27408143877983093, "step": 58 }, { "completion_length": 90.72916793823242, "epoch": 0.944, "grad_norm": 1.1269292807249032, "kl": 0.00830078125, "learning_rate": 2.8557870956832133e-07, "loss": 0.0, "reward": 0.4935041069984436, "reward_std": 0.41843119263648987, "rewards/correct_code_reward_func": 0.2083333432674408, "rewards/len_reward_func": 0.285170778632164, "step": 59 }, { "completion_length": 85.60416793823242, "epoch": 0.96, "grad_norm": 2.320388470663489, "kl": 0.014678955078125, "learning_rate": 2.7914261414993976e-07, "loss": 0.0, "reward": 0.7554058134555817, "reward_std": 0.5069911777973175, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.3387391269207001, "step": 60 }, { "completion_length": 63.375, "epoch": 0.976, "grad_norm": 1.7319214973496064, "kl": 0.02532958984375, "learning_rate": 2.726868745873286e-07, "loss": 0.0, "reward": 0.7839343547821045, "reward_std": 0.6209487617015839, "rewards/correct_code_reward_func": 0.4791666716337204, "rewards/len_reward_func": 0.3047676384449005, "step": 61 }, { "completion_length": 87.14583587646484, "epoch": 0.992, "grad_norm": 1.8272498546531741, "kl": 0.0134735107421875, "learning_rate": 2.662158424969357e-07, "loss": 0.0, "reward": 0.8219521045684814, "reward_std": 0.6945097148418427, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.28028544783592224, "step": 62 }, { "completion_length": 55.66666793823242, "epoch": 1.0, "grad_norm": 1.8272498546531741, "kl": 0.02587890625, "learning_rate": 2.597338798034344e-07, "loss": 0.0, "reward": 0.713922381401062, "reward_std": 0.519837498664856, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.29725566506385803, "step": 63 }, { "completion_length": 88.75000381469727, "epoch": 1.016, "grad_norm": 1.6950346991160663, "kl": 0.0108642578125, "learning_rate": 2.532453557994827e-07, "loss": 0.0, "reward": 0.5927524715662003, "reward_std": 0.39128445088863373, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.21775247156620026, "step": 64 }, { "completion_length": 151.7291717529297, "epoch": 1.032, "grad_norm": 1.6408461481438466, "kl": 0.011138916015625, "learning_rate": 2.467546442005173e-07, "loss": 0.0, "reward": 0.6122622489929199, "reward_std": 0.5165137350559235, "rewards/correct_code_reward_func": 0.3125000149011612, "rewards/len_reward_func": 0.2997622489929199, "step": 65 }, { "completion_length": 104.85417175292969, "epoch": 1.048, "grad_norm": 1.1573620161491798, "kl": 0.01092529296875, "learning_rate": 2.4026612019656556e-07, "loss": 0.0, "reward": 0.8486100733280182, "reward_std": 0.3942585438489914, "rewards/correct_code_reward_func": 0.5, "rewards/len_reward_func": 0.348610058426857, "step": 66 }, { "completion_length": 62.47916793823242, "epoch": 1.064, "grad_norm": 2.1966023559129266, "kl": 0.018798828125, "learning_rate": 2.337841575030642e-07, "loss": 0.0, "reward": 0.8105108737945557, "reward_std": 0.4338831454515457, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.35217756032943726, "step": 67 }, { "completion_length": 74.95833587646484, "epoch": 1.08, "grad_norm": 1.796160832910341, "kl": 0.02294921875, "learning_rate": 2.2731312541267143e-07, "loss": 0.0, "reward": 0.549996554851532, "reward_std": 0.3687018007040024, "rewards/correct_code_reward_func": 0.2083333358168602, "rewards/len_reward_func": 0.3416632413864136, "step": 68 }, { "completion_length": 80.14583587646484, "epoch": 1.096, "grad_norm": 2.1344146728324653, "kl": 0.02447509765625, "learning_rate": 2.2085738585006021e-07, "loss": 0.0, "reward": 0.8650955259799957, "reward_std": 0.4139704555273056, "rewards/correct_code_reward_func": 0.5208333432674408, "rewards/len_reward_func": 0.34426216781139374, "step": 69 }, { "completion_length": 60.958335876464844, "epoch": 1.112, "grad_norm": 1.6686676921157912, "kl": 0.025634765625, "learning_rate": 2.1442129043167873e-07, "loss": 0.0, "reward": 0.6947443187236786, "reward_std": 0.5725615322589874, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.319744348526001, "step": 70 }, { "completion_length": 108.1875, "epoch": 1.1280000000000001, "grad_norm": 1.7272596794076989, "kl": 0.0130615234375, "learning_rate": 2.0800917753245875e-07, "loss": 0.0, "reward": 0.7587291896343231, "reward_std": 0.5232284665107727, "rewards/correct_code_reward_func": 0.4166666865348816, "rewards/len_reward_func": 0.3420625329017639, "step": 71 }, { "completion_length": 108.04167175292969, "epoch": 1.144, "grad_norm": 1.6272563745253346, "kl": 0.01654052734375, "learning_rate": 2.0162536936145008e-07, "loss": 0.0, "reward": 0.5046872794628143, "reward_std": 0.3378771096467972, "rewards/correct_code_reward_func": 0.1666666679084301, "rewards/len_reward_func": 0.33802059292793274, "step": 72 }, { "completion_length": 54.02083396911621, "epoch": 1.16, "grad_norm": 1.9418689539056528, "kl": 0.0308837890625, "learning_rate": 1.9527416904835132e-07, "loss": 0.0, "reward": 0.9055829644203186, "reward_std": 0.3730238378047943, "rewards/correct_code_reward_func": 0.5, "rewards/len_reward_func": 0.405582919716835, "step": 73 }, { "completion_length": 94.31250381469727, "epoch": 1.176, "grad_norm": 1.5576616620611914, "kl": 0.02215576171875, "learning_rate": 1.889598577429022e-07, "loss": 0.0, "reward": 0.9071804285049438, "reward_std": 0.44920457899570465, "rewards/correct_code_reward_func": 0.5000000298023224, "rewards/len_reward_func": 0.40718045830726624, "step": 74 }, { "completion_length": 53.79166793823242, "epoch": 1.192, "grad_norm": 2.3725141345867544, "kl": 0.03057861328125, "learning_rate": 1.8268669172909136e-07, "loss": 0.0, "reward": 0.9221459329128265, "reward_std": 0.4697086811065674, "rewards/correct_code_reward_func": 0.5000000298023224, "rewards/len_reward_func": 0.42214588820934296, "step": 75 }, { "completion_length": 89.79167175292969, "epoch": 1.208, "grad_norm": 2.003223060045919, "kl": 0.03094482421875, "learning_rate": 1.7645889955612592e-07, "loss": 0.0, "reward": 1.0163878798484802, "reward_std": 0.43504565954208374, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.3913878947496414, "step": 76 }, { "completion_length": 68.79166984558105, "epoch": 1.224, "grad_norm": 2.361523245499291, "kl": 0.0457763671875, "learning_rate": 1.7028067918809535e-07, "loss": 0.0, "reward": 0.7535229325294495, "reward_std": 0.47849828004837036, "rewards/correct_code_reward_func": 0.375, "rewards/len_reward_func": 0.3785228729248047, "step": 77 }, { "completion_length": 54.14583396911621, "epoch": 1.24, "grad_norm": 2.120116927446423, "kl": 0.0394287109375, "learning_rate": 1.6415619517425294e-07, "loss": 0.0, "reward": 0.8538325130939484, "reward_std": 0.44848716259002686, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.3746658265590668, "step": 78 }, { "completion_length": 89.0, "epoch": 1.256, "grad_norm": 1.2055136830985975, "kl": 0.0272216796875, "learning_rate": 1.5808957584181994e-07, "loss": 0.0, "reward": 0.755169004201889, "reward_std": 0.4014817923307419, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.40100236237049103, "step": 79 }, { "completion_length": 99.39583969116211, "epoch": 1.272, "grad_norm": 1.84690544945913, "kl": 0.024322509765625, "learning_rate": 1.5208491051320744e-07, "loss": 0.0, "reward": 0.7356246709823608, "reward_std": 0.47616493701934814, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.33979131281375885, "step": 80 }, { "completion_length": 73.04166793823242, "epoch": 1.288, "grad_norm": 1.7278725529442787, "kl": 0.0439453125, "learning_rate": 1.461462467495284e-07, "loss": 0.0, "reward": 0.7051982879638672, "reward_std": 0.48877203464508057, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.3926983177661896, "step": 81 }, { "completion_length": 59.354169845581055, "epoch": 1.304, "grad_norm": 2.077567652472909, "kl": 0.0345458984375, "learning_rate": 1.4027758762226107e-07, "loss": 0.0, "reward": 0.816185712814331, "reward_std": 0.4705541431903839, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.3370189964771271, "step": 82 }, { "completion_length": 81.58333587646484, "epoch": 1.32, "grad_norm": 1.609719907980881, "kl": 0.0234375, "learning_rate": 1.3448288901490092e-07, "loss": 0.0, "reward": 0.7908000648021698, "reward_std": 0.45585089921951294, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.374133437871933, "step": 83 }, { "completion_length": 87.33333587646484, "epoch": 1.336, "grad_norm": 1.6587537084233746, "kl": 0.02667236328125, "learning_rate": 1.2876605695642084e-07, "loss": 0.0, "reward": 0.6749401688575745, "reward_std": 0.42905712127685547, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.3207734525203705, "step": 84 }, { "completion_length": 95.20833587646484, "epoch": 1.3519999999999999, "grad_norm": 2.538472018686139, "kl": 0.02581787109375, "learning_rate": 1.231309449883361e-07, "loss": 0.0, "reward": 0.7594759464263916, "reward_std": 0.5746750831604004, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.3844759315252304, "step": 85 }, { "completion_length": 55.43750190734863, "epoch": 1.3679999999999999, "grad_norm": 1.797373425635401, "kl": 0.03289794921875, "learning_rate": 1.1758135156715041e-07, "loss": 0.0, "reward": 0.9961144328117371, "reward_std": 0.5648430436849594, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.37111443281173706, "step": 86 }, { "completion_length": 121.25000762939453, "epoch": 1.384, "grad_norm": 1.7119982491506713, "kl": 0.0286865234375, "learning_rate": 1.1212101750393235e-07, "loss": 0.0, "reward": 0.7243427634239197, "reward_std": 0.3805614560842514, "rewards/correct_code_reward_func": 0.3333333358168602, "rewards/len_reward_func": 0.39100944995880127, "step": 87 }, { "completion_length": 57.35416793823242, "epoch": 1.4, "grad_norm": 1.7713124187158098, "kl": 0.034912109375, "learning_rate": 1.0675362344274952e-07, "loss": 0.0, "reward": 0.7016758322715759, "reward_std": 0.5317542552947998, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.34750914573669434, "step": 88 }, { "completion_length": 59.0625, "epoch": 1.416, "grad_norm": 1.6492634665708499, "kl": 0.034423828125, "learning_rate": 1.0148278737965844e-07, "loss": 0.0, "reward": 0.7394144237041473, "reward_std": 0.4491709917783737, "rewards/correct_code_reward_func": 0.3541666716337204, "rewards/len_reward_func": 0.38524775207042694, "step": 89 }, { "completion_length": 48.6875, "epoch": 1.432, "grad_norm": 1.9432473699712165, "kl": 0.06494140625, "learning_rate": 9.631206222392479e-08, "loss": 0.0001, "reward": 0.8676341474056244, "reward_std": 0.3966159522533417, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.388467475771904, "step": 90 }, { "completion_length": 91.62500381469727, "epoch": 1.448, "grad_norm": 1.9189293687085252, "kl": 0.13482666015625, "learning_rate": 9.124493340311537e-08, "loss": 0.0001, "reward": 0.7231810688972473, "reward_std": 0.4981995224952698, "rewards/correct_code_reward_func": 0.3333333432674408, "rewards/len_reward_func": 0.3898477256298065, "step": 91 }, { "completion_length": 60.729169845581055, "epoch": 1.464, "grad_norm": 1.9825880271843388, "kl": 0.03424072265625, "learning_rate": 8.628481651367875e-08, "loss": 0.0, "reward": 0.8303024768829346, "reward_std": 0.40181903541088104, "rewards/correct_code_reward_func": 0.4375000149011612, "rewards/len_reward_func": 0.39280249178409576, "step": 92 }, { "completion_length": 58.22916793823242, "epoch": 1.48, "grad_norm": 1.8747344082688029, "kl": 0.0426025390625, "learning_rate": 8.143505501859551e-08, "loss": 0.0, "reward": 0.7909549474716187, "reward_std": 0.4536728262901306, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.33262157440185547, "step": 93 }, { "completion_length": 125.10417175292969, "epoch": 1.496, "grad_norm": 1.5754029745287528, "kl": 0.02886962890625, "learning_rate": 7.669891799365282e-08, "loss": 0.0, "reward": 0.6297820806503296, "reward_std": 0.5051470398902893, "rewards/correct_code_reward_func": 0.2708333432674408, "rewards/len_reward_func": 0.3589487075805664, "step": 94 }, { "completion_length": 89.27083587646484, "epoch": 1.512, "grad_norm": 1.698829198816419, "kl": 0.02362060546875, "learning_rate": 7.207959792385998e-08, "loss": 0.0, "reward": 0.7924558222293854, "reward_std": 0.42506614327430725, "rewards/correct_code_reward_func": 0.3541666865348816, "rewards/len_reward_func": 0.4382891356945038, "step": 95 }, { "completion_length": 82.18750381469727, "epoch": 1.528, "grad_norm": 1.4031599496951968, "kl": 0.03643798828125, "learning_rate": 6.758020855149249e-08, "loss": 0.0, "reward": 0.6851500123739243, "reward_std": 0.2974398583173752, "rewards/correct_code_reward_func": 0.25000000558793545, "rewards/len_reward_func": 0.43515002727508545, "step": 96 }, { "completion_length": 54.6875, "epoch": 1.544, "grad_norm": 1.4467008481635895, "kl": 0.039306640625, "learning_rate": 6.320378277721342e-08, "loss": 0.0, "reward": 0.7509966492652893, "reward_std": 0.3042096644639969, "rewards/correct_code_reward_func": 0.3125, "rewards/len_reward_func": 0.4384966343641281, "step": 97 }, { "completion_length": 68.08333587646484, "epoch": 1.56, "grad_norm": 2.082709482850275, "kl": 0.03460693359375, "learning_rate": 5.895327061568775e-08, "loss": 0.0, "reward": 0.7968247532844543, "reward_std": 0.36605267226696014, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.42182472348213196, "step": 98 }, { "completion_length": 56.020835876464844, "epoch": 1.576, "grad_norm": 2.726579074776626, "kl": 0.0662841796875, "learning_rate": 5.483153720706798e-08, "loss": 0.0001, "reward": 0.8111520707607269, "reward_std": 0.548240602016449, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.3944854289293289, "step": 99 }, { "completion_length": 54.25000190734863, "epoch": 1.592, "grad_norm": 2.079061824739654, "kl": 0.0452880859375, "learning_rate": 5.0841360885690996e-08, "loss": 0.0, "reward": 0.9174363613128662, "reward_std": 0.46667972207069397, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.375769704580307, "step": 100 }, { "completion_length": 65.72916793823242, "epoch": 1.608, "grad_norm": 1.5292386933354263, "kl": 0.04522705078125, "learning_rate": 4.698543130728755e-08, "loss": 0.0, "reward": 0.8213175535202026, "reward_std": 0.38392098248004913, "rewards/correct_code_reward_func": 0.458333358168602, "rewards/len_reward_func": 0.3629842549562454, "step": 101 }, { "completion_length": 67.77083587646484, "epoch": 1.624, "grad_norm": 1.352325105446135, "kl": 0.0390625, "learning_rate": 4.326634763596784e-08, "loss": 0.0, "reward": 0.7263242900371552, "reward_std": 0.37168650329113007, "rewards/correct_code_reward_func": 0.31250002048909664, "rewards/len_reward_func": 0.41382429003715515, "step": 102 }, { "completion_length": 64.10416793823242, "epoch": 1.6400000000000001, "grad_norm": 1.9987254276022863, "kl": 0.02880859375, "learning_rate": 3.968661679220467e-08, "loss": 0.0, "reward": 1.174392580986023, "reward_std": 0.4813085198402405, "rewards/correct_code_reward_func": 0.7500000298023224, "rewards/len_reward_func": 0.42439255118370056, "step": 103 }, { "completion_length": 57.437503814697266, "epoch": 1.6560000000000001, "grad_norm": 1.5506203528349733, "kl": 0.041015625, "learning_rate": 3.624865176299499e-08, "loss": 0.0, "reward": 0.9918626546859741, "reward_std": 0.5309067815542221, "rewards/correct_code_reward_func": 0.6666666865348816, "rewards/len_reward_func": 0.3251959830522537, "step": 104 }, { "completion_length": 114.50000762939453, "epoch": 1.6720000000000002, "grad_norm": 1.538301941895194, "kl": 0.0245361328125, "learning_rate": 3.295476997533905e-08, "loss": 0.0, "reward": 0.9100688099861145, "reward_std": 0.29824198782444, "rewards/correct_code_reward_func": 0.4583333432674408, "rewards/len_reward_func": 0.4517354816198349, "step": 105 }, { "completion_length": 129.81250381469727, "epoch": 1.688, "grad_norm": 1.3867754807731443, "kl": 0.0283203125, "learning_rate": 2.980719173413396e-08, "loss": 0.0, "reward": 0.818383663892746, "reward_std": 0.5115247815847397, "rewards/correct_code_reward_func": 0.4166666716337204, "rewards/len_reward_func": 0.4017169624567032, "step": 106 }, { "completion_length": 73.33333587646484, "epoch": 1.704, "grad_norm": 2.2267187145460765, "kl": 0.04461669921875, "learning_rate": 2.680803872553408e-08, "loss": 0.0, "reward": 0.8567679226398468, "reward_std": 0.51302769780159, "rewards/correct_code_reward_func": 0.4375, "rewards/len_reward_func": 0.4192679077386856, "step": 107 }, { "completion_length": 53.54166793823242, "epoch": 1.72, "grad_norm": 3.1940102299602953, "kl": 0.0521240234375, "learning_rate": 2.395933258678745e-08, "loss": 0.0001, "reward": 0.9940223693847656, "reward_std": 0.46572498977184296, "rewards/correct_code_reward_func": 0.6041666865348816, "rewards/len_reward_func": 0.3898557126522064, "step": 108 }, { "completion_length": 41.52083396911621, "epoch": 1.736, "grad_norm": 2.0727978566546295, "kl": 0.0655517578125, "learning_rate": 2.1262993543511715e-08, "loss": 0.0001, "reward": 0.9489125609397888, "reward_std": 0.5604254603385925, "rewards/correct_code_reward_func": 0.6250000298023224, "rewards/len_reward_func": 0.32391248643398285, "step": 109 }, { "completion_length": 106.08333587646484, "epoch": 1.752, "grad_norm": 2.3414859603625806, "kl": 0.03424072265625, "learning_rate": 1.872083911532907e-08, "loss": 0.0, "reward": 0.5710697174072266, "reward_std": 0.4303289204835892, "rewards/correct_code_reward_func": 0.1666666679084301, "rewards/len_reward_func": 0.4044030159711838, "step": 110 }, { "completion_length": 60.437503814697266, "epoch": 1.768, "grad_norm": 1.5494191116212308, "kl": 0.046875, "learning_rate": 1.6334582890731697e-08, "loss": 0.0, "reward": 1.0543819665908813, "reward_std": 0.4688963294029236, "rewards/correct_code_reward_func": 0.6666666865348816, "rewards/len_reward_func": 0.38771532475948334, "step": 111 }, { "completion_length": 139.43750381469727, "epoch": 1.784, "grad_norm": 1.8975149766982131, "kl": 0.0323486328125, "learning_rate": 1.4105833372004523e-08, "loss": 0.0, "reward": 0.7198583781719208, "reward_std": 0.2770904451608658, "rewards/correct_code_reward_func": 0.2708333395421505, "rewards/len_reward_func": 0.4490250498056412, "step": 112 }, { "completion_length": 71.87500190734863, "epoch": 1.8, "grad_norm": 1.8779975481307012, "kl": 0.0350341796875, "learning_rate": 1.2036092890982619e-08, "loss": 0.0, "reward": 0.6213224828243256, "reward_std": 0.39381173253059387, "rewards/correct_code_reward_func": 0.25, "rewards/len_reward_func": 0.3713224530220032, "step": 113 }, { "completion_length": 73.16666793823242, "epoch": 1.8159999999999998, "grad_norm": 1.625916920606493, "kl": 0.04345703125, "learning_rate": 1.0126756596375685e-08, "loss": 0.0, "reward": 0.8906111121177673, "reward_std": 0.5251133739948273, "rewards/correct_code_reward_func": 0.4791666865348816, "rewards/len_reward_func": 0.41144441068172455, "step": 114 }, { "completion_length": 39.85416793823242, "epoch": 1.8319999999999999, "grad_norm": 1.8155165345051183, "kl": 0.0440673828125, "learning_rate": 8.379111513340753e-09, "loss": 0.0, "reward": 0.8687795996665955, "reward_std": 0.4838385283946991, "rewards/correct_code_reward_func": 0.4583333358168602, "rewards/len_reward_func": 0.41044625639915466, "step": 115 }, { "completion_length": 75.58333396911621, "epoch": 1.8479999999999999, "grad_norm": 1.8222797961879316, "kl": 0.03985595703125, "learning_rate": 6.7943356759381785e-09, "loss": 0.0, "reward": 0.9320607483386993, "reward_std": 0.5384509861469269, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.39039406180381775, "step": 116 }, { "completion_length": 68.54166984558105, "epoch": 1.8639999999999999, "grad_norm": 2.0020075086567775, "kl": 0.031982421875, "learning_rate": 5.373497333054616e-09, "loss": 0.0, "reward": 0.9275134801864624, "reward_std": 0.4482097327709198, "rewards/correct_code_reward_func": 0.5000000298023224, "rewards/len_reward_func": 0.4275134950876236, "step": 117 }, { "completion_length": 73.91666793823242, "epoch": 1.88, "grad_norm": 1.7788611304062052, "kl": 0.03240966796875, "learning_rate": 4.117554228329406e-09, "loss": 0.0, "reward": 0.9304822385311127, "reward_std": 0.5174555033445358, "rewards/correct_code_reward_func": 0.5416666865348816, "rewards/len_reward_func": 0.38881558179855347, "step": 118 }, { "completion_length": 56.20833396911621, "epoch": 1.896, "grad_norm": 2.1126141119280257, "kl": 0.0341796875, "learning_rate": 3.0273529545687125e-09, "loss": 0.0, "reward": 0.7594221532344818, "reward_std": 0.480338990688324, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.3635888248682022, "step": 119 }, { "completion_length": 72.47916793823242, "epoch": 1.912, "grad_norm": 1.4598566413193612, "kl": 0.03466796875, "learning_rate": 2.1036283830834224e-09, "loss": 0.0, "reward": 0.7889427244663239, "reward_std": 0.48503294587135315, "rewards/correct_code_reward_func": 0.3958333432674408, "rewards/len_reward_func": 0.39310936629772186, "step": 120 }, { "completion_length": 40.85416793823242, "epoch": 1.928, "grad_norm": 2.335195303935002, "kl": 0.056640625, "learning_rate": 1.347003168334665e-09, "loss": 0.0001, "reward": 1.0662382543087006, "reward_std": 0.2768351137638092, "rewards/correct_code_reward_func": 0.6250000149011612, "rewards/len_reward_func": 0.44123825430870056, "step": 121 }, { "completion_length": 50.62500190734863, "epoch": 1.944, "grad_norm": 1.8386331097859265, "kl": 0.03173828125, "learning_rate": 7.579873282216598e-10, "loss": 0.0, "reward": 0.8906074166297913, "reward_std": 0.5252098143100739, "rewards/correct_code_reward_func": 0.5833333730697632, "rewards/len_reward_func": 0.30727406591176987, "step": 122 }, { "completion_length": 99.4375057220459, "epoch": 1.96, "grad_norm": 1.621045537411182, "kl": 0.0238037109375, "learning_rate": 3.3697790029424413e-10, "loss": 0.0, "reward": 0.9505272507667542, "reward_std": 0.5842320024967194, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.36719387769699097, "step": 123 }, { "completion_length": 63.000003814697266, "epoch": 1.976, "grad_norm": 2.157350197672568, "kl": 0.0465087890625, "learning_rate": 8.425867412190091e-11, "loss": 0.0, "reward": 0.9762873649597168, "reward_std": 0.5066816210746765, "rewards/correct_code_reward_func": 0.5833333432674408, "rewards/len_reward_func": 0.3929540067911148, "step": 124 }, { "completion_length": 126.97917175292969, "epoch": 1.992, "grad_norm": 1.7642641467833304, "kl": 0.02130126953125, "learning_rate": 0.0, "loss": 0.0, "reward": 0.7899810075759888, "reward_std": 0.38732415437698364, "rewards/correct_code_reward_func": 0.3750000149011612, "rewards/len_reward_func": 0.41498102247714996, "step": 125 }, { "epoch": 1.992, "step": 125, "total_flos": 0.0, "train_loss": 1.9367338650191358e-05, "train_runtime": 3648.0047, "train_samples_per_second": 0.206, "train_steps_per_second": 0.034 } ], "logging_steps": 1, "max_steps": 125, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }