|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.992, |
|
"eval_steps": 500, |
|
"global_step": 125, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 140.9583396911621, |
|
"epoch": 0.016, |
|
"grad_norm": 1.4258953228320013, |
|
"kl": 0.0, |
|
"learning_rate": 1.25e-07, |
|
"loss": 0.0, |
|
"reward": 0.5152640044689178, |
|
"reward_std": 0.5508254170417786, |
|
"rewards/correct_code_reward_func": 0.2291666716337204, |
|
"rewards/len_reward_func": 0.28609737753868103, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 131.50000762939453, |
|
"epoch": 0.032, |
|
"grad_norm": 1.1519258687122351, |
|
"kl": 0.0, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.0, |
|
"reward": 0.541226252913475, |
|
"reward_std": 0.5189632624387741, |
|
"rewards/correct_code_reward_func": 0.2500000111758709, |
|
"rewards/len_reward_func": 0.29122625291347504, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 108.83333587646484, |
|
"epoch": 0.048, |
|
"grad_norm": 1.467116667329371, |
|
"kl": 0.00013637542724609375, |
|
"learning_rate": 3.75e-07, |
|
"loss": 0.0, |
|
"reward": 0.7587994039058685, |
|
"reward_std": 0.5140225142240524, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.21713273972272873, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 159.81250762939453, |
|
"epoch": 0.064, |
|
"grad_norm": 1.2682485669137435, |
|
"kl": 0.00018215179443359375, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.5178248882293701, |
|
"reward_std": 0.4526914358139038, |
|
"rewards/correct_code_reward_func": 0.1666666716337204, |
|
"rewards/len_reward_func": 0.3511582016944885, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 176.56250762939453, |
|
"epoch": 0.08, |
|
"grad_norm": 1.2033440109589688, |
|
"kl": 0.00014066696166992188, |
|
"learning_rate": 4.999157413258781e-07, |
|
"loss": 0.0, |
|
"reward": 0.32241350412368774, |
|
"reward_std": 0.32281263172626495, |
|
"rewards/correct_code_reward_func": 0.02083333395421505, |
|
"rewards/len_reward_func": 0.30158019065856934, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 124.87500762939453, |
|
"epoch": 0.096, |
|
"grad_norm": 1.5120707071506325, |
|
"kl": 0.00016808509826660156, |
|
"learning_rate": 4.996630220997057e-07, |
|
"loss": 0.0, |
|
"reward": 0.746085911989212, |
|
"reward_std": 0.5452268123626709, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.28775252401828766, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 169.9166717529297, |
|
"epoch": 0.112, |
|
"grad_norm": 0.9079518632617903, |
|
"kl": 0.00011348724365234375, |
|
"learning_rate": 4.992420126717784e-07, |
|
"loss": 0.0, |
|
"reward": 0.36989694088697433, |
|
"reward_std": 0.45903605222702026, |
|
"rewards/correct_code_reward_func": 0.125, |
|
"rewards/len_reward_func": 0.24489693343639374, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 219.43750762939453, |
|
"epoch": 0.128, |
|
"grad_norm": 1.2633142753352289, |
|
"kl": 0.0002155303955078125, |
|
"learning_rate": 4.986529968316653e-07, |
|
"loss": 0.0, |
|
"reward": 0.44794920086860657, |
|
"reward_std": 0.385338693857193, |
|
"rewards/correct_code_reward_func": 0.1250000037252903, |
|
"rewards/len_reward_func": 0.3229491859674454, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 227.91667938232422, |
|
"epoch": 0.144, |
|
"grad_norm": 1.0211344567101885, |
|
"kl": 0.00011777877807617188, |
|
"learning_rate": 4.978963716169165e-07, |
|
"loss": 0.0, |
|
"reward": 0.6235890090465546, |
|
"reward_std": 0.5187947303056717, |
|
"rewards/correct_code_reward_func": 0.3125, |
|
"rewards/len_reward_func": 0.31108900904655457, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 188.25000762939453, |
|
"epoch": 0.16, |
|
"grad_norm": 1.0353822839723037, |
|
"kl": 0.00011730194091796875, |
|
"learning_rate": 4.969726470454313e-07, |
|
"loss": 0.0, |
|
"reward": 0.6911160051822662, |
|
"reward_std": 0.5456923246383667, |
|
"rewards/correct_code_reward_func": 0.4166666865348816, |
|
"rewards/len_reward_func": 0.27444930374622345, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 168.27083587646484, |
|
"epoch": 0.176, |
|
"grad_norm": 1.7856755608823207, |
|
"kl": 0.00018310546875, |
|
"learning_rate": 4.958824457716706e-07, |
|
"loss": 0.0, |
|
"reward": 0.4588584154844284, |
|
"reward_std": 0.40716809034347534, |
|
"rewards/correct_code_reward_func": 0.1875, |
|
"rewards/len_reward_func": 0.271358385682106, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 203.08333587646484, |
|
"epoch": 0.192, |
|
"grad_norm": 0.9296992149271633, |
|
"kl": 0.00016641616821289062, |
|
"learning_rate": 4.946265026669454e-07, |
|
"loss": 0.0, |
|
"reward": 0.3501324951648712, |
|
"reward_std": 0.49003708362579346, |
|
"rewards/correct_code_reward_func": 0.1041666679084301, |
|
"rewards/len_reward_func": 0.245965838432312, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 115.66666793823242, |
|
"epoch": 0.208, |
|
"grad_norm": 1.4335533212366607, |
|
"kl": 0.00016570091247558594, |
|
"learning_rate": 4.932056643240618e-07, |
|
"loss": 0.0, |
|
"reward": 0.7853705883026123, |
|
"reward_std": 0.46111349761486053, |
|
"rewards/correct_code_reward_func": 0.5000000149011612, |
|
"rewards/len_reward_func": 0.2853705883026123, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 169.95833587646484, |
|
"epoch": 0.224, |
|
"grad_norm": 1.2723280538596287, |
|
"kl": 0.00021076202392578125, |
|
"learning_rate": 4.916208884866592e-07, |
|
"loss": 0.0, |
|
"reward": 0.5324039310216904, |
|
"reward_std": 0.5338821411132812, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.26157061755657196, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 154.58333587646484, |
|
"epoch": 0.24, |
|
"grad_norm": 1.2578666329332273, |
|
"kl": 0.00019168853759765625, |
|
"learning_rate": 4.898732434036243e-07, |
|
"loss": 0.0, |
|
"reward": 0.5949100255966187, |
|
"reward_std": 0.5048613250255585, |
|
"rewards/correct_code_reward_func": 0.3125000149011612, |
|
"rewards/len_reward_func": 0.28241002559661865, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 173.1875114440918, |
|
"epoch": 0.256, |
|
"grad_norm": 1.1230347862341579, |
|
"kl": 0.00029277801513671875, |
|
"learning_rate": 4.879639071090173e-07, |
|
"loss": 0.0, |
|
"reward": 0.4564344882965088, |
|
"reward_std": 0.4671656936407089, |
|
"rewards/correct_code_reward_func": 0.1666666679084301, |
|
"rewards/len_reward_func": 0.2897678166627884, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 169.375, |
|
"epoch": 0.272, |
|
"grad_norm": 1.3041956300758726, |
|
"kl": 0.0002574920654296875, |
|
"learning_rate": 4.858941666279955e-07, |
|
"loss": 0.0, |
|
"reward": 0.6347246468067169, |
|
"reward_std": 0.5289804339408875, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.2805579602718353, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 133.25000762939453, |
|
"epoch": 0.288, |
|
"grad_norm": 1.354822217310785, |
|
"kl": 0.0002689361572265625, |
|
"learning_rate": 4.836654171092682e-07, |
|
"loss": 0.0, |
|
"reward": 0.5779364109039307, |
|
"reward_std": 0.4782462567090988, |
|
"rewards/correct_code_reward_func": 0.2916666716337204, |
|
"rewards/len_reward_func": 0.2862697243690491, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 99.41667175292969, |
|
"epoch": 0.304, |
|
"grad_norm": 1.4087777232916079, |
|
"kl": 0.00031757354736328125, |
|
"learning_rate": 4.812791608846709e-07, |
|
"loss": 0.0, |
|
"reward": 0.5035808980464935, |
|
"reward_std": 0.46289560198783875, |
|
"rewards/correct_code_reward_func": 0.229166679084301, |
|
"rewards/len_reward_func": 0.27441420406103134, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 170.7291717529297, |
|
"epoch": 0.32, |
|
"grad_norm": 0.9923230664440412, |
|
"kl": 0.00028705596923828125, |
|
"learning_rate": 4.787370064564882e-07, |
|
"loss": 0.0, |
|
"reward": 0.5567075908184052, |
|
"reward_std": 0.44439028203487396, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.34837424755096436, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 124.72917175292969, |
|
"epoch": 0.336, |
|
"grad_norm": 1.2245791922735345, |
|
"kl": 0.00035572052001953125, |
|
"learning_rate": 4.7604066741321253e-07, |
|
"loss": 0.0, |
|
"reward": 0.8560027182102203, |
|
"reward_std": 0.6356588900089264, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.31433598697185516, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 123.64583969116211, |
|
"epoch": 0.352, |
|
"grad_norm": 1.2080469812565267, |
|
"kl": 0.00035858154296875, |
|
"learning_rate": 4.731919612744659e-07, |
|
"loss": 0.0, |
|
"reward": 0.7242447733879089, |
|
"reward_std": 0.4742405414581299, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.32841143012046814, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 146.2916717529297, |
|
"epoch": 0.368, |
|
"grad_norm": 1.2440640880474592, |
|
"kl": 0.00040721893310546875, |
|
"learning_rate": 4.7019280826586604e-07, |
|
"loss": 0.0, |
|
"reward": 0.5270938575267792, |
|
"reward_std": 0.4260385036468506, |
|
"rewards/correct_code_reward_func": 0.2291666679084301, |
|
"rewards/len_reward_func": 0.2979271858930588, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 141.9166717529297, |
|
"epoch": 0.384, |
|
"grad_norm": 1.455943571941334, |
|
"kl": 0.0006427764892578125, |
|
"learning_rate": 4.6704523002466094e-07, |
|
"loss": 0.0, |
|
"reward": 0.5917265266180038, |
|
"reward_std": 0.47722122073173523, |
|
"rewards/correct_code_reward_func": 0.3333333358168602, |
|
"rewards/len_reward_func": 0.25839313119649887, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 240.85417938232422, |
|
"epoch": 0.4, |
|
"grad_norm": 0.8411889507435418, |
|
"kl": 0.0003604888916015625, |
|
"learning_rate": 4.6375134823700503e-07, |
|
"loss": 0.0, |
|
"reward": 0.3353981524705887, |
|
"reward_std": 0.351834774017334, |
|
"rewards/correct_code_reward_func": 0.0833333358168602, |
|
"rewards/len_reward_func": 0.2520648390054703, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 97.31250381469727, |
|
"epoch": 0.416, |
|
"grad_norm": 1.374585753278975, |
|
"kl": 0.0008258819580078125, |
|
"learning_rate": 4.603133832077953e-07, |
|
"loss": 0.0, |
|
"reward": 0.6881800889968872, |
|
"reward_std": 0.5626422464847565, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.2506800442934036, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 131.08333587646484, |
|
"epoch": 0.432, |
|
"grad_norm": 1.5040369557196518, |
|
"kl": 0.0006847381591796875, |
|
"learning_rate": 4.5673365236403216e-07, |
|
"loss": 0.0, |
|
"reward": 0.6470239758491516, |
|
"reward_std": 0.39606642723083496, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.20952393114566803, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 198.06250762939453, |
|
"epoch": 0.448, |
|
"grad_norm": 1.1110007536297855, |
|
"kl": 0.00054168701171875, |
|
"learning_rate": 4.530145686927125e-07, |
|
"loss": 0.0, |
|
"reward": 0.5166794955730438, |
|
"reward_std": 0.504486620426178, |
|
"rewards/correct_code_reward_func": 0.2500000149011612, |
|
"rewards/len_reward_func": 0.2666794955730438, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 152.52083587646484, |
|
"epoch": 0.464, |
|
"grad_norm": 1.134262039216797, |
|
"kl": 0.00078582763671875, |
|
"learning_rate": 4.4915863911430897e-07, |
|
"loss": 0.0, |
|
"reward": 0.5144253522157669, |
|
"reward_std": 0.4733017832040787, |
|
"rewards/correct_code_reward_func": 0.1875000111758709, |
|
"rewards/len_reward_func": 0.3269253224134445, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 139.7916717529297, |
|
"epoch": 0.48, |
|
"grad_norm": 1.010573889887009, |
|
"kl": 0.0007152557373046875, |
|
"learning_rate": 4.45168462792932e-07, |
|
"loss": 0.0, |
|
"reward": 0.5882390439510345, |
|
"reward_std": 0.43310636281967163, |
|
"rewards/correct_code_reward_func": 0.2500000074505806, |
|
"rewards/len_reward_func": 0.33823904395103455, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 87.41666793823242, |
|
"epoch": 0.496, |
|
"grad_norm": 1.540244950569226, |
|
"kl": 0.0012340545654296875, |
|
"learning_rate": 4.4104672938431223e-07, |
|
"loss": 0.0, |
|
"reward": 0.7711681425571442, |
|
"reward_std": 0.4805651605129242, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.18783476203680038, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 101.43750381469727, |
|
"epoch": 0.512, |
|
"grad_norm": 2.3673085026520297, |
|
"kl": 0.0012607574462890625, |
|
"learning_rate": 4.367962172227866e-07, |
|
"loss": 0.0, |
|
"reward": 0.7279457449913025, |
|
"reward_std": 0.4627054035663605, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.2696124166250229, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 155.2291717529297, |
|
"epoch": 0.528, |
|
"grad_norm": 1.2624598609488873, |
|
"kl": 0.00139617919921875, |
|
"learning_rate": 4.324197914485075e-07, |
|
"loss": 0.0, |
|
"reward": 0.6401492655277252, |
|
"reward_std": 0.515736848115921, |
|
"rewards/correct_code_reward_func": 0.375, |
|
"rewards/len_reward_func": 0.26514923572540283, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 252.91667938232422, |
|
"epoch": 0.544, |
|
"grad_norm": 1.043728438493038, |
|
"kl": 0.0008392333984375, |
|
"learning_rate": 4.2792040207614e-07, |
|
"loss": 0.0, |
|
"reward": 0.6339870393276215, |
|
"reward_std": 0.5688490867614746, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.30065372586250305, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 178.25, |
|
"epoch": 0.56, |
|
"grad_norm": 1.2442169258805433, |
|
"kl": 0.00205230712890625, |
|
"learning_rate": 4.2330108200634723e-07, |
|
"loss": 0.0, |
|
"reward": 0.43357332795858383, |
|
"reward_std": 0.3690243661403656, |
|
"rewards/correct_code_reward_func": 0.16666667722165585, |
|
"rewards/len_reward_func": 0.26690666377544403, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 150.1666717529297, |
|
"epoch": 0.576, |
|
"grad_norm": 1.0937981889230137, |
|
"kl": 0.0016021728515625, |
|
"learning_rate": 4.185649449814045e-07, |
|
"loss": 0.0, |
|
"reward": 0.8725252151489258, |
|
"reward_std": 0.5368492603302002, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.3308584541082382, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 74.41666793823242, |
|
"epoch": 0.592, |
|
"grad_norm": 1.4560552034278569, |
|
"kl": 0.0020904541015625, |
|
"learning_rate": 4.137151834863213e-07, |
|
"loss": 0.0, |
|
"reward": 0.7634576857089996, |
|
"reward_std": 0.5292592346668243, |
|
"rewards/correct_code_reward_func": 0.5416666716337204, |
|
"rewards/len_reward_func": 0.22179099917411804, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 111.77083587646484, |
|
"epoch": 0.608, |
|
"grad_norm": 1.6125607277054597, |
|
"kl": 0.002716064453125, |
|
"learning_rate": 4.087550665968846e-07, |
|
"loss": 0.0, |
|
"reward": 0.6047167330980301, |
|
"reward_std": 0.4415762424468994, |
|
"rewards/correct_code_reward_func": 0.2916666865348816, |
|
"rewards/len_reward_func": 0.3130500763654709, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 87.0625, |
|
"epoch": 0.624, |
|
"grad_norm": 2.0747921723056026, |
|
"kl": 0.0023193359375, |
|
"learning_rate": 4.036879377760752e-07, |
|
"loss": 0.0, |
|
"reward": 0.7261738479137421, |
|
"reward_std": 0.6433705389499664, |
|
"rewards/correct_code_reward_func": 0.520833358168602, |
|
"rewards/len_reward_func": 0.20534051209688187, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 128.0833396911621, |
|
"epoch": 0.64, |
|
"grad_norm": 1.352520841789316, |
|
"kl": 0.00229644775390625, |
|
"learning_rate": 3.9851721262034157e-07, |
|
"loss": 0.0, |
|
"reward": 0.49166351556777954, |
|
"reward_std": 0.4290030002593994, |
|
"rewards/correct_code_reward_func": 0.18750000558793545, |
|
"rewards/len_reward_func": 0.30416350066661835, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 117.33333587646484, |
|
"epoch": 0.656, |
|
"grad_norm": 1.5281074207353524, |
|
"kl": 0.003509521484375, |
|
"learning_rate": 3.932463765572505e-07, |
|
"loss": 0.0, |
|
"reward": 0.5800679922103882, |
|
"reward_std": 0.5416670143604279, |
|
"rewards/correct_code_reward_func": 0.3125000149011612, |
|
"rewards/len_reward_func": 0.2675679475069046, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 112.43750381469727, |
|
"epoch": 0.672, |
|
"grad_norm": 1.2084984435618142, |
|
"kl": 0.00252532958984375, |
|
"learning_rate": 3.8787898249606767e-07, |
|
"loss": 0.0, |
|
"reward": 0.42490366101264954, |
|
"reward_std": 0.46323399245738983, |
|
"rewards/correct_code_reward_func": 0.14583333395421505, |
|
"rewards/len_reward_func": 0.27907034754753113, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 56.85416793823242, |
|
"epoch": 0.688, |
|
"grad_norm": 1.8756323954488632, |
|
"kl": 0.00452423095703125, |
|
"learning_rate": 3.8241864843284964e-07, |
|
"loss": 0.0, |
|
"reward": 0.7274035811424255, |
|
"reward_std": 0.5209662765264511, |
|
"rewards/correct_code_reward_func": 0.5000000149011612, |
|
"rewards/len_reward_func": 0.22740358859300613, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 153.68750762939453, |
|
"epoch": 0.704, |
|
"grad_norm": 1.785627080388602, |
|
"kl": 0.0055084228515625, |
|
"learning_rate": 3.768690550116639e-07, |
|
"loss": 0.0, |
|
"reward": 0.49254634976387024, |
|
"reward_std": 0.4052678644657135, |
|
"rewards/correct_code_reward_func": 0.1666666716337204, |
|
"rewards/len_reward_func": 0.32587967813014984, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 170.1041717529297, |
|
"epoch": 0.72, |
|
"grad_norm": 1.2057879792669277, |
|
"kl": 0.0038299560546875, |
|
"learning_rate": 3.712339430435792e-07, |
|
"loss": 0.0, |
|
"reward": 0.5373264253139496, |
|
"reward_std": 0.4612013250589371, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.2664930745959282, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 122.79167175292969, |
|
"epoch": 0.736, |
|
"grad_norm": 1.23844328247912, |
|
"kl": 0.00384521484375, |
|
"learning_rate": 3.65517110985099e-07, |
|
"loss": 0.0, |
|
"reward": 0.6534424722194672, |
|
"reward_std": 0.5896010398864746, |
|
"rewards/correct_code_reward_func": 0.354166679084301, |
|
"rewards/len_reward_func": 0.29927581548690796, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 73.39583396911621, |
|
"epoch": 0.752, |
|
"grad_norm": 2.222315006145743, |
|
"kl": 0.0058135986328125, |
|
"learning_rate": 3.597224123777389e-07, |
|
"loss": 0.0, |
|
"reward": 0.7357015609741211, |
|
"reward_std": 0.5119403451681137, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.2773682177066803, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 75.54166793823242, |
|
"epoch": 0.768, |
|
"grad_norm": 1.9981519435567456, |
|
"kl": 0.0053863525390625, |
|
"learning_rate": 3.5385375325047163e-07, |
|
"loss": 0.0, |
|
"reward": 0.6428782939910889, |
|
"reward_std": 0.6202229559421539, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.24704494327306747, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 73.27083587646484, |
|
"epoch": 0.784, |
|
"grad_norm": 2.073070842958071, |
|
"kl": 0.00554656982421875, |
|
"learning_rate": 3.479150894867926e-07, |
|
"loss": 0.0, |
|
"reward": 0.8005061745643616, |
|
"reward_std": 0.5489170849323273, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.25883948802948, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 93.62500381469727, |
|
"epoch": 0.8, |
|
"grad_norm": 1.7280406240103203, |
|
"kl": 0.0070953369140625, |
|
"learning_rate": 3.4191042415818e-07, |
|
"loss": 0.0, |
|
"reward": 0.6382943987846375, |
|
"reward_std": 0.4014574736356735, |
|
"rewards/correct_code_reward_func": 0.3750000149011612, |
|
"rewards/len_reward_func": 0.26329439133405685, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 110.31250381469727, |
|
"epoch": 0.816, |
|
"grad_norm": 1.5732703630042588, |
|
"kl": 0.008453369140625, |
|
"learning_rate": 3.3584380482574717e-07, |
|
"loss": 0.0, |
|
"reward": 0.8389279842376709, |
|
"reward_std": 0.6495693922042847, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.31809471547603607, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 81.4375, |
|
"epoch": 0.832, |
|
"grad_norm": 1.3555162901411408, |
|
"kl": 0.0072479248046875, |
|
"learning_rate": 3.297193208119047e-07, |
|
"loss": 0.0, |
|
"reward": 0.7050519585609436, |
|
"reward_std": 0.522288054227829, |
|
"rewards/correct_code_reward_func": 0.4375000298023224, |
|
"rewards/len_reward_func": 0.2675519585609436, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 145.2291717529297, |
|
"epoch": 0.848, |
|
"grad_norm": 1.2256688073258564, |
|
"kl": 0.00726318359375, |
|
"learning_rate": 3.235411004438741e-07, |
|
"loss": 0.0, |
|
"reward": 0.6400169730186462, |
|
"reward_std": 0.5816708207130432, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.28585030883550644, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 120.20833587646484, |
|
"epoch": 0.864, |
|
"grad_norm": 1.8462631631415796, |
|
"kl": 0.0084991455078125, |
|
"learning_rate": 3.173133082709086e-07, |
|
"loss": 0.0, |
|
"reward": 0.643402487039566, |
|
"reward_std": 0.3417808264493942, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.31006917357444763, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 55.56250190734863, |
|
"epoch": 0.88, |
|
"grad_norm": 1.7370166581779802, |
|
"kl": 0.01177978515625, |
|
"learning_rate": 3.1104014225709784e-07, |
|
"loss": 0.0, |
|
"reward": 0.9137917459011078, |
|
"reward_std": 0.5003669559955597, |
|
"rewards/correct_code_reward_func": 0.583333358168602, |
|
"rewards/len_reward_func": 0.3304583728313446, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 189.25000762939453, |
|
"epoch": 0.896, |
|
"grad_norm": 1.2196760565152192, |
|
"kl": 0.0058441162109375, |
|
"learning_rate": 3.0472583095164873e-07, |
|
"loss": 0.0, |
|
"reward": 0.4673280417919159, |
|
"reward_std": 0.4577627182006836, |
|
"rewards/correct_code_reward_func": 0.1666666716337204, |
|
"rewards/len_reward_func": 0.3006613999605179, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 57.37500190734863, |
|
"epoch": 0.912, |
|
"grad_norm": 2.0919947468048976, |
|
"kl": 0.010162353515625, |
|
"learning_rate": 2.983746306385499e-07, |
|
"loss": 0.0, |
|
"reward": 0.6931174695491791, |
|
"reward_std": 0.5172313153743744, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.21395081281661987, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 86.00000190734863, |
|
"epoch": 0.928, |
|
"grad_norm": 1.5907089477428527, |
|
"kl": 0.0113677978515625, |
|
"learning_rate": 2.919908224675412e-07, |
|
"loss": 0.0, |
|
"reward": 0.5865814685821533, |
|
"reward_std": 0.5177368223667145, |
|
"rewards/correct_code_reward_func": 0.3125000149011612, |
|
"rewards/len_reward_func": 0.27408143877983093, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 90.72916793823242, |
|
"epoch": 0.944, |
|
"grad_norm": 1.1269292807249032, |
|
"kl": 0.00830078125, |
|
"learning_rate": 2.8557870956832133e-07, |
|
"loss": 0.0, |
|
"reward": 0.4935041069984436, |
|
"reward_std": 0.41843119263648987, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.285170778632164, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 85.60416793823242, |
|
"epoch": 0.96, |
|
"grad_norm": 2.320388470663489, |
|
"kl": 0.014678955078125, |
|
"learning_rate": 2.7914261414993976e-07, |
|
"loss": 0.0, |
|
"reward": 0.7554058134555817, |
|
"reward_std": 0.5069911777973175, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.3387391269207001, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 63.375, |
|
"epoch": 0.976, |
|
"grad_norm": 1.7319214973496064, |
|
"kl": 0.02532958984375, |
|
"learning_rate": 2.726868745873286e-07, |
|
"loss": 0.0, |
|
"reward": 0.7839343547821045, |
|
"reward_std": 0.6209487617015839, |
|
"rewards/correct_code_reward_func": 0.4791666716337204, |
|
"rewards/len_reward_func": 0.3047676384449005, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 87.14583587646484, |
|
"epoch": 0.992, |
|
"grad_norm": 1.8272498546531741, |
|
"kl": 0.0134735107421875, |
|
"learning_rate": 2.662158424969357e-07, |
|
"loss": 0.0, |
|
"reward": 0.8219521045684814, |
|
"reward_std": 0.6945097148418427, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.28028544783592224, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 55.66666793823242, |
|
"epoch": 1.0, |
|
"grad_norm": 1.8272498546531741, |
|
"kl": 0.02587890625, |
|
"learning_rate": 2.597338798034344e-07, |
|
"loss": 0.0, |
|
"reward": 0.713922381401062, |
|
"reward_std": 0.519837498664856, |
|
"rewards/correct_code_reward_func": 0.4166666865348816, |
|
"rewards/len_reward_func": 0.29725566506385803, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 88.75000381469727, |
|
"epoch": 1.016, |
|
"grad_norm": 1.6950346991160663, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 2.532453557994827e-07, |
|
"loss": 0.0, |
|
"reward": 0.5927524715662003, |
|
"reward_std": 0.39128445088863373, |
|
"rewards/correct_code_reward_func": 0.3750000149011612, |
|
"rewards/len_reward_func": 0.21775247156620026, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 151.7291717529297, |
|
"epoch": 1.032, |
|
"grad_norm": 1.6408461481438466, |
|
"kl": 0.011138916015625, |
|
"learning_rate": 2.467546442005173e-07, |
|
"loss": 0.0, |
|
"reward": 0.6122622489929199, |
|
"reward_std": 0.5165137350559235, |
|
"rewards/correct_code_reward_func": 0.3125000149011612, |
|
"rewards/len_reward_func": 0.2997622489929199, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 104.85417175292969, |
|
"epoch": 1.048, |
|
"grad_norm": 1.1573620161491798, |
|
"kl": 0.01092529296875, |
|
"learning_rate": 2.4026612019656556e-07, |
|
"loss": 0.0, |
|
"reward": 0.8486100733280182, |
|
"reward_std": 0.3942585438489914, |
|
"rewards/correct_code_reward_func": 0.5, |
|
"rewards/len_reward_func": 0.348610058426857, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 62.47916793823242, |
|
"epoch": 1.064, |
|
"grad_norm": 2.1966023559129266, |
|
"kl": 0.018798828125, |
|
"learning_rate": 2.337841575030642e-07, |
|
"loss": 0.0, |
|
"reward": 0.8105108737945557, |
|
"reward_std": 0.4338831454515457, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.35217756032943726, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 74.95833587646484, |
|
"epoch": 1.08, |
|
"grad_norm": 1.796160832910341, |
|
"kl": 0.02294921875, |
|
"learning_rate": 2.2731312541267143e-07, |
|
"loss": 0.0, |
|
"reward": 0.549996554851532, |
|
"reward_std": 0.3687018007040024, |
|
"rewards/correct_code_reward_func": 0.2083333358168602, |
|
"rewards/len_reward_func": 0.3416632413864136, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 80.14583587646484, |
|
"epoch": 1.096, |
|
"grad_norm": 2.1344146728324653, |
|
"kl": 0.02447509765625, |
|
"learning_rate": 2.2085738585006021e-07, |
|
"loss": 0.0, |
|
"reward": 0.8650955259799957, |
|
"reward_std": 0.4139704555273056, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.34426216781139374, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 60.958335876464844, |
|
"epoch": 1.112, |
|
"grad_norm": 1.6686676921157912, |
|
"kl": 0.025634765625, |
|
"learning_rate": 2.1442129043167873e-07, |
|
"loss": 0.0, |
|
"reward": 0.6947443187236786, |
|
"reward_std": 0.5725615322589874, |
|
"rewards/correct_code_reward_func": 0.375, |
|
"rewards/len_reward_func": 0.319744348526001, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 108.1875, |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 1.7272596794076989, |
|
"kl": 0.0130615234375, |
|
"learning_rate": 2.0800917753245875e-07, |
|
"loss": 0.0, |
|
"reward": 0.7587291896343231, |
|
"reward_std": 0.5232284665107727, |
|
"rewards/correct_code_reward_func": 0.4166666865348816, |
|
"rewards/len_reward_func": 0.3420625329017639, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 108.04167175292969, |
|
"epoch": 1.144, |
|
"grad_norm": 1.6272563745253346, |
|
"kl": 0.01654052734375, |
|
"learning_rate": 2.0162536936145008e-07, |
|
"loss": 0.0, |
|
"reward": 0.5046872794628143, |
|
"reward_std": 0.3378771096467972, |
|
"rewards/correct_code_reward_func": 0.1666666679084301, |
|
"rewards/len_reward_func": 0.33802059292793274, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 54.02083396911621, |
|
"epoch": 1.16, |
|
"grad_norm": 1.9418689539056528, |
|
"kl": 0.0308837890625, |
|
"learning_rate": 1.9527416904835132e-07, |
|
"loss": 0.0, |
|
"reward": 0.9055829644203186, |
|
"reward_std": 0.3730238378047943, |
|
"rewards/correct_code_reward_func": 0.5, |
|
"rewards/len_reward_func": 0.405582919716835, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 94.31250381469727, |
|
"epoch": 1.176, |
|
"grad_norm": 1.5576616620611914, |
|
"kl": 0.02215576171875, |
|
"learning_rate": 1.889598577429022e-07, |
|
"loss": 0.0, |
|
"reward": 0.9071804285049438, |
|
"reward_std": 0.44920457899570465, |
|
"rewards/correct_code_reward_func": 0.5000000298023224, |
|
"rewards/len_reward_func": 0.40718045830726624, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 53.79166793823242, |
|
"epoch": 1.192, |
|
"grad_norm": 2.3725141345867544, |
|
"kl": 0.03057861328125, |
|
"learning_rate": 1.8268669172909136e-07, |
|
"loss": 0.0, |
|
"reward": 0.9221459329128265, |
|
"reward_std": 0.4697086811065674, |
|
"rewards/correct_code_reward_func": 0.5000000298023224, |
|
"rewards/len_reward_func": 0.42214588820934296, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 89.79167175292969, |
|
"epoch": 1.208, |
|
"grad_norm": 2.003223060045919, |
|
"kl": 0.03094482421875, |
|
"learning_rate": 1.7645889955612592e-07, |
|
"loss": 0.0, |
|
"reward": 1.0163878798484802, |
|
"reward_std": 0.43504565954208374, |
|
"rewards/correct_code_reward_func": 0.6250000298023224, |
|
"rewards/len_reward_func": 0.3913878947496414, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 68.79166984558105, |
|
"epoch": 1.224, |
|
"grad_norm": 2.361523245499291, |
|
"kl": 0.0457763671875, |
|
"learning_rate": 1.7028067918809535e-07, |
|
"loss": 0.0, |
|
"reward": 0.7535229325294495, |
|
"reward_std": 0.47849828004837036, |
|
"rewards/correct_code_reward_func": 0.375, |
|
"rewards/len_reward_func": 0.3785228729248047, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 54.14583396911621, |
|
"epoch": 1.24, |
|
"grad_norm": 2.120116927446423, |
|
"kl": 0.0394287109375, |
|
"learning_rate": 1.6415619517425294e-07, |
|
"loss": 0.0, |
|
"reward": 0.8538325130939484, |
|
"reward_std": 0.44848716259002686, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.3746658265590668, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 89.0, |
|
"epoch": 1.256, |
|
"grad_norm": 1.2055136830985975, |
|
"kl": 0.0272216796875, |
|
"learning_rate": 1.5808957584181994e-07, |
|
"loss": 0.0, |
|
"reward": 0.755169004201889, |
|
"reward_std": 0.4014817923307419, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.40100236237049103, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 99.39583969116211, |
|
"epoch": 1.272, |
|
"grad_norm": 1.84690544945913, |
|
"kl": 0.024322509765625, |
|
"learning_rate": 1.5208491051320744e-07, |
|
"loss": 0.0, |
|
"reward": 0.7356246709823608, |
|
"reward_std": 0.47616493701934814, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.33979131281375885, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 73.04166793823242, |
|
"epoch": 1.288, |
|
"grad_norm": 1.7278725529442787, |
|
"kl": 0.0439453125, |
|
"learning_rate": 1.461462467495284e-07, |
|
"loss": 0.0, |
|
"reward": 0.7051982879638672, |
|
"reward_std": 0.48877203464508057, |
|
"rewards/correct_code_reward_func": 0.3125, |
|
"rewards/len_reward_func": 0.3926983177661896, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 59.354169845581055, |
|
"epoch": 1.304, |
|
"grad_norm": 2.077567652472909, |
|
"kl": 0.0345458984375, |
|
"learning_rate": 1.4027758762226107e-07, |
|
"loss": 0.0, |
|
"reward": 0.816185712814331, |
|
"reward_std": 0.4705541431903839, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.3370189964771271, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 81.58333587646484, |
|
"epoch": 1.32, |
|
"grad_norm": 1.609719907980881, |
|
"kl": 0.0234375, |
|
"learning_rate": 1.3448288901490092e-07, |
|
"loss": 0.0, |
|
"reward": 0.7908000648021698, |
|
"reward_std": 0.45585089921951294, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.374133437871933, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 87.33333587646484, |
|
"epoch": 1.336, |
|
"grad_norm": 1.6587537084233746, |
|
"kl": 0.02667236328125, |
|
"learning_rate": 1.2876605695642084e-07, |
|
"loss": 0.0, |
|
"reward": 0.6749401688575745, |
|
"reward_std": 0.42905712127685547, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.3207734525203705, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 95.20833587646484, |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 2.538472018686139, |
|
"kl": 0.02581787109375, |
|
"learning_rate": 1.231309449883361e-07, |
|
"loss": 0.0, |
|
"reward": 0.7594759464263916, |
|
"reward_std": 0.5746750831604004, |
|
"rewards/correct_code_reward_func": 0.3750000149011612, |
|
"rewards/len_reward_func": 0.3844759315252304, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 55.43750190734863, |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 1.797373425635401, |
|
"kl": 0.03289794921875, |
|
"learning_rate": 1.1758135156715041e-07, |
|
"loss": 0.0, |
|
"reward": 0.9961144328117371, |
|
"reward_std": 0.5648430436849594, |
|
"rewards/correct_code_reward_func": 0.6250000298023224, |
|
"rewards/len_reward_func": 0.37111443281173706, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 121.25000762939453, |
|
"epoch": 1.384, |
|
"grad_norm": 1.7119982491506713, |
|
"kl": 0.0286865234375, |
|
"learning_rate": 1.1212101750393235e-07, |
|
"loss": 0.0, |
|
"reward": 0.7243427634239197, |
|
"reward_std": 0.3805614560842514, |
|
"rewards/correct_code_reward_func": 0.3333333358168602, |
|
"rewards/len_reward_func": 0.39100944995880127, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 57.35416793823242, |
|
"epoch": 1.4, |
|
"grad_norm": 1.7713124187158098, |
|
"kl": 0.034912109375, |
|
"learning_rate": 1.0675362344274952e-07, |
|
"loss": 0.0, |
|
"reward": 0.7016758322715759, |
|
"reward_std": 0.5317542552947998, |
|
"rewards/correct_code_reward_func": 0.3541666865348816, |
|
"rewards/len_reward_func": 0.34750914573669434, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 59.0625, |
|
"epoch": 1.416, |
|
"grad_norm": 1.6492634665708499, |
|
"kl": 0.034423828125, |
|
"learning_rate": 1.0148278737965844e-07, |
|
"loss": 0.0, |
|
"reward": 0.7394144237041473, |
|
"reward_std": 0.4491709917783737, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.38524775207042694, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 48.6875, |
|
"epoch": 1.432, |
|
"grad_norm": 1.9432473699712165, |
|
"kl": 0.06494140625, |
|
"learning_rate": 9.631206222392479e-08, |
|
"loss": 0.0001, |
|
"reward": 0.8676341474056244, |
|
"reward_std": 0.3966159522533417, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.388467475771904, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 91.62500381469727, |
|
"epoch": 1.448, |
|
"grad_norm": 1.9189293687085252, |
|
"kl": 0.13482666015625, |
|
"learning_rate": 9.124493340311537e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7231810688972473, |
|
"reward_std": 0.4981995224952698, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.3898477256298065, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 60.729169845581055, |
|
"epoch": 1.464, |
|
"grad_norm": 1.9825880271843388, |
|
"kl": 0.03424072265625, |
|
"learning_rate": 8.628481651367875e-08, |
|
"loss": 0.0, |
|
"reward": 0.8303024768829346, |
|
"reward_std": 0.40181903541088104, |
|
"rewards/correct_code_reward_func": 0.4375000149011612, |
|
"rewards/len_reward_func": 0.39280249178409576, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 58.22916793823242, |
|
"epoch": 1.48, |
|
"grad_norm": 1.8747344082688029, |
|
"kl": 0.0426025390625, |
|
"learning_rate": 8.143505501859551e-08, |
|
"loss": 0.0, |
|
"reward": 0.7909549474716187, |
|
"reward_std": 0.4536728262901306, |
|
"rewards/correct_code_reward_func": 0.458333358168602, |
|
"rewards/len_reward_func": 0.33262157440185547, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 125.10417175292969, |
|
"epoch": 1.496, |
|
"grad_norm": 1.5754029745287528, |
|
"kl": 0.02886962890625, |
|
"learning_rate": 7.669891799365282e-08, |
|
"loss": 0.0, |
|
"reward": 0.6297820806503296, |
|
"reward_std": 0.5051470398902893, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.3589487075805664, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 89.27083587646484, |
|
"epoch": 1.512, |
|
"grad_norm": 1.698829198816419, |
|
"kl": 0.02362060546875, |
|
"learning_rate": 7.207959792385998e-08, |
|
"loss": 0.0, |
|
"reward": 0.7924558222293854, |
|
"reward_std": 0.42506614327430725, |
|
"rewards/correct_code_reward_func": 0.3541666865348816, |
|
"rewards/len_reward_func": 0.4382891356945038, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 82.18750381469727, |
|
"epoch": 1.528, |
|
"grad_norm": 1.4031599496951968, |
|
"kl": 0.03643798828125, |
|
"learning_rate": 6.758020855149249e-08, |
|
"loss": 0.0, |
|
"reward": 0.6851500123739243, |
|
"reward_std": 0.2974398583173752, |
|
"rewards/correct_code_reward_func": 0.25000000558793545, |
|
"rewards/len_reward_func": 0.43515002727508545, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 54.6875, |
|
"epoch": 1.544, |
|
"grad_norm": 1.4467008481635895, |
|
"kl": 0.039306640625, |
|
"learning_rate": 6.320378277721342e-08, |
|
"loss": 0.0, |
|
"reward": 0.7509966492652893, |
|
"reward_std": 0.3042096644639969, |
|
"rewards/correct_code_reward_func": 0.3125, |
|
"rewards/len_reward_func": 0.4384966343641281, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 68.08333587646484, |
|
"epoch": 1.56, |
|
"grad_norm": 2.082709482850275, |
|
"kl": 0.03460693359375, |
|
"learning_rate": 5.895327061568775e-08, |
|
"loss": 0.0, |
|
"reward": 0.7968247532844543, |
|
"reward_std": 0.36605267226696014, |
|
"rewards/correct_code_reward_func": 0.3750000149011612, |
|
"rewards/len_reward_func": 0.42182472348213196, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 56.020835876464844, |
|
"epoch": 1.576, |
|
"grad_norm": 2.726579074776626, |
|
"kl": 0.0662841796875, |
|
"learning_rate": 5.483153720706798e-08, |
|
"loss": 0.0001, |
|
"reward": 0.8111520707607269, |
|
"reward_std": 0.548240602016449, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.3944854289293289, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 54.25000190734863, |
|
"epoch": 1.592, |
|
"grad_norm": 2.079061824739654, |
|
"kl": 0.0452880859375, |
|
"learning_rate": 5.0841360885690996e-08, |
|
"loss": 0.0, |
|
"reward": 0.9174363613128662, |
|
"reward_std": 0.46667972207069397, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.375769704580307, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 65.72916793823242, |
|
"epoch": 1.608, |
|
"grad_norm": 1.5292386933354263, |
|
"kl": 0.04522705078125, |
|
"learning_rate": 4.698543130728755e-08, |
|
"loss": 0.0, |
|
"reward": 0.8213175535202026, |
|
"reward_std": 0.38392098248004913, |
|
"rewards/correct_code_reward_func": 0.458333358168602, |
|
"rewards/len_reward_func": 0.3629842549562454, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 67.77083587646484, |
|
"epoch": 1.624, |
|
"grad_norm": 1.352325105446135, |
|
"kl": 0.0390625, |
|
"learning_rate": 4.326634763596784e-08, |
|
"loss": 0.0, |
|
"reward": 0.7263242900371552, |
|
"reward_std": 0.37168650329113007, |
|
"rewards/correct_code_reward_func": 0.31250002048909664, |
|
"rewards/len_reward_func": 0.41382429003715515, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 64.10416793823242, |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 1.9987254276022863, |
|
"kl": 0.02880859375, |
|
"learning_rate": 3.968661679220467e-08, |
|
"loss": 0.0, |
|
"reward": 1.174392580986023, |
|
"reward_std": 0.4813085198402405, |
|
"rewards/correct_code_reward_func": 0.7500000298023224, |
|
"rewards/len_reward_func": 0.42439255118370056, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 57.437503814697266, |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 1.5506203528349733, |
|
"kl": 0.041015625, |
|
"learning_rate": 3.624865176299499e-08, |
|
"loss": 0.0, |
|
"reward": 0.9918626546859741, |
|
"reward_std": 0.5309067815542221, |
|
"rewards/correct_code_reward_func": 0.6666666865348816, |
|
"rewards/len_reward_func": 0.3251959830522537, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 114.50000762939453, |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 1.538301941895194, |
|
"kl": 0.0245361328125, |
|
"learning_rate": 3.295476997533905e-08, |
|
"loss": 0.0, |
|
"reward": 0.9100688099861145, |
|
"reward_std": 0.29824198782444, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.4517354816198349, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 129.81250381469727, |
|
"epoch": 1.688, |
|
"grad_norm": 1.3867754807731443, |
|
"kl": 0.0283203125, |
|
"learning_rate": 2.980719173413396e-08, |
|
"loss": 0.0, |
|
"reward": 0.818383663892746, |
|
"reward_std": 0.5115247815847397, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.4017169624567032, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 73.33333587646484, |
|
"epoch": 1.704, |
|
"grad_norm": 2.2267187145460765, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 2.680803872553408e-08, |
|
"loss": 0.0, |
|
"reward": 0.8567679226398468, |
|
"reward_std": 0.51302769780159, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.4192679077386856, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 53.54166793823242, |
|
"epoch": 1.72, |
|
"grad_norm": 3.1940102299602953, |
|
"kl": 0.0521240234375, |
|
"learning_rate": 2.395933258678745e-08, |
|
"loss": 0.0001, |
|
"reward": 0.9940223693847656, |
|
"reward_std": 0.46572498977184296, |
|
"rewards/correct_code_reward_func": 0.6041666865348816, |
|
"rewards/len_reward_func": 0.3898557126522064, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 41.52083396911621, |
|
"epoch": 1.736, |
|
"grad_norm": 2.0727978566546295, |
|
"kl": 0.0655517578125, |
|
"learning_rate": 2.1262993543511715e-08, |
|
"loss": 0.0001, |
|
"reward": 0.9489125609397888, |
|
"reward_std": 0.5604254603385925, |
|
"rewards/correct_code_reward_func": 0.6250000298023224, |
|
"rewards/len_reward_func": 0.32391248643398285, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 106.08333587646484, |
|
"epoch": 1.752, |
|
"grad_norm": 2.3414859603625806, |
|
"kl": 0.03424072265625, |
|
"learning_rate": 1.872083911532907e-08, |
|
"loss": 0.0, |
|
"reward": 0.5710697174072266, |
|
"reward_std": 0.4303289204835892, |
|
"rewards/correct_code_reward_func": 0.1666666679084301, |
|
"rewards/len_reward_func": 0.4044030159711838, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 60.437503814697266, |
|
"epoch": 1.768, |
|
"grad_norm": 1.5494191116212308, |
|
"kl": 0.046875, |
|
"learning_rate": 1.6334582890731697e-08, |
|
"loss": 0.0, |
|
"reward": 1.0543819665908813, |
|
"reward_std": 0.4688963294029236, |
|
"rewards/correct_code_reward_func": 0.6666666865348816, |
|
"rewards/len_reward_func": 0.38771532475948334, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 139.43750381469727, |
|
"epoch": 1.784, |
|
"grad_norm": 1.8975149766982131, |
|
"kl": 0.0323486328125, |
|
"learning_rate": 1.4105833372004523e-08, |
|
"loss": 0.0, |
|
"reward": 0.7198583781719208, |
|
"reward_std": 0.2770904451608658, |
|
"rewards/correct_code_reward_func": 0.2708333395421505, |
|
"rewards/len_reward_func": 0.4490250498056412, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 71.87500190734863, |
|
"epoch": 1.8, |
|
"grad_norm": 1.8779975481307012, |
|
"kl": 0.0350341796875, |
|
"learning_rate": 1.2036092890982619e-08, |
|
"loss": 0.0, |
|
"reward": 0.6213224828243256, |
|
"reward_std": 0.39381173253059387, |
|
"rewards/correct_code_reward_func": 0.25, |
|
"rewards/len_reward_func": 0.3713224530220032, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 73.16666793823242, |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 1.625916920606493, |
|
"kl": 0.04345703125, |
|
"learning_rate": 1.0126756596375685e-08, |
|
"loss": 0.0, |
|
"reward": 0.8906111121177673, |
|
"reward_std": 0.5251133739948273, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.41144441068172455, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 39.85416793823242, |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 1.8155165345051183, |
|
"kl": 0.0440673828125, |
|
"learning_rate": 8.379111513340753e-09, |
|
"loss": 0.0, |
|
"reward": 0.8687795996665955, |
|
"reward_std": 0.4838385283946991, |
|
"rewards/correct_code_reward_func": 0.4583333358168602, |
|
"rewards/len_reward_func": 0.41044625639915466, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 75.58333396911621, |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 1.8222797961879316, |
|
"kl": 0.03985595703125, |
|
"learning_rate": 6.7943356759381785e-09, |
|
"loss": 0.0, |
|
"reward": 0.9320607483386993, |
|
"reward_std": 0.5384509861469269, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.39039406180381775, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 68.54166984558105, |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 2.0020075086567775, |
|
"kl": 0.031982421875, |
|
"learning_rate": 5.373497333054616e-09, |
|
"loss": 0.0, |
|
"reward": 0.9275134801864624, |
|
"reward_std": 0.4482097327709198, |
|
"rewards/correct_code_reward_func": 0.5000000298023224, |
|
"rewards/len_reward_func": 0.4275134950876236, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 73.91666793823242, |
|
"epoch": 1.88, |
|
"grad_norm": 1.7788611304062052, |
|
"kl": 0.03240966796875, |
|
"learning_rate": 4.117554228329406e-09, |
|
"loss": 0.0, |
|
"reward": 0.9304822385311127, |
|
"reward_std": 0.5174555033445358, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.38881558179855347, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 56.20833396911621, |
|
"epoch": 1.896, |
|
"grad_norm": 2.1126141119280257, |
|
"kl": 0.0341796875, |
|
"learning_rate": 3.0273529545687125e-09, |
|
"loss": 0.0, |
|
"reward": 0.7594221532344818, |
|
"reward_std": 0.480338990688324, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.3635888248682022, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 72.47916793823242, |
|
"epoch": 1.912, |
|
"grad_norm": 1.4598566413193612, |
|
"kl": 0.03466796875, |
|
"learning_rate": 2.1036283830834224e-09, |
|
"loss": 0.0, |
|
"reward": 0.7889427244663239, |
|
"reward_std": 0.48503294587135315, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.39310936629772186, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 40.85416793823242, |
|
"epoch": 1.928, |
|
"grad_norm": 2.335195303935002, |
|
"kl": 0.056640625, |
|
"learning_rate": 1.347003168334665e-09, |
|
"loss": 0.0001, |
|
"reward": 1.0662382543087006, |
|
"reward_std": 0.2768351137638092, |
|
"rewards/correct_code_reward_func": 0.6250000149011612, |
|
"rewards/len_reward_func": 0.44123825430870056, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 50.62500190734863, |
|
"epoch": 1.944, |
|
"grad_norm": 1.8386331097859265, |
|
"kl": 0.03173828125, |
|
"learning_rate": 7.579873282216598e-10, |
|
"loss": 0.0, |
|
"reward": 0.8906074166297913, |
|
"reward_std": 0.5252098143100739, |
|
"rewards/correct_code_reward_func": 0.5833333730697632, |
|
"rewards/len_reward_func": 0.30727406591176987, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 99.4375057220459, |
|
"epoch": 1.96, |
|
"grad_norm": 1.621045537411182, |
|
"kl": 0.0238037109375, |
|
"learning_rate": 3.3697790029424413e-10, |
|
"loss": 0.0, |
|
"reward": 0.9505272507667542, |
|
"reward_std": 0.5842320024967194, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.36719387769699097, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 63.000003814697266, |
|
"epoch": 1.976, |
|
"grad_norm": 2.157350197672568, |
|
"kl": 0.0465087890625, |
|
"learning_rate": 8.425867412190091e-11, |
|
"loss": 0.0, |
|
"reward": 0.9762873649597168, |
|
"reward_std": 0.5066816210746765, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.3929540067911148, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 126.97917175292969, |
|
"epoch": 1.992, |
|
"grad_norm": 1.7642641467833304, |
|
"kl": 0.02130126953125, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.7899810075759888, |
|
"reward_std": 0.38732415437698364, |
|
"rewards/correct_code_reward_func": 0.3750000149011612, |
|
"rewards/len_reward_func": 0.41498102247714996, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"step": 125, |
|
"total_flos": 0.0, |
|
"train_loss": 1.9367338650191358e-05, |
|
"train_runtime": 3648.0047, |
|
"train_samples_per_second": 0.206, |
|
"train_steps_per_second": 0.034 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 125, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|