|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5714285714285714, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 3001.9584350585938, |
|
"epoch": 0.001142857142857143, |
|
"grad_norm": 0.19055147469043732, |
|
"kl": 0.0, |
|
"learning_rate": 2e-08, |
|
"loss": -0.0, |
|
"reward": -0.0029319413006305695, |
|
"reward_std": 0.12454631552100182, |
|
"rewards/cosine_scaled_reward": -0.1928562317043543, |
|
"rewards/format_reward": 0.37500000558793545, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 2822.541717529297, |
|
"epoch": 0.002285714285714286, |
|
"grad_norm": 0.28548890352249146, |
|
"kl": 0.0, |
|
"learning_rate": 4e-08, |
|
"loss": 0.0, |
|
"reward": 0.11451426180428825, |
|
"reward_std": 0.2134026400744915, |
|
"rewards/cosine_scaled_reward": -0.009885392151772976, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 2819.25, |
|
"epoch": 0.0034285714285714284, |
|
"grad_norm": 0.1939578354358673, |
|
"kl": 3.499537706375122e-05, |
|
"learning_rate": 6e-08, |
|
"loss": 0.0, |
|
"reward": -0.07006961421575397, |
|
"reward_std": 0.11589069850742817, |
|
"rewards/cosine_scaled_reward": -0.29296013712882996, |
|
"rewards/format_reward": 0.31250000186264515, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 2995.2501220703125, |
|
"epoch": 0.004571428571428572, |
|
"grad_norm": 0.21970215439796448, |
|
"kl": 4.45246696472168e-05, |
|
"learning_rate": 8e-08, |
|
"loss": 0.0, |
|
"reward": -0.03315656236372888, |
|
"reward_std": 0.14162674359977245, |
|
"rewards/cosine_scaled_reward": -0.20897372206673026, |
|
"rewards/format_reward": 0.29166668094694614, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 2716.666748046875, |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 0.17376844584941864, |
|
"kl": 2.6345252990722656e-05, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0, |
|
"reward": 0.1103692501783371, |
|
"reward_std": 0.14548173919320107, |
|
"rewards/cosine_scaled_reward": 0.002536635845899582, |
|
"rewards/format_reward": 0.41666667722165585, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 2795.229248046875, |
|
"epoch": 0.006857142857142857, |
|
"grad_norm": 0.20082038640975952, |
|
"kl": 3.698468208312988e-05, |
|
"learning_rate": 1.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.07470211386680603, |
|
"reward_std": 0.1657848320901394, |
|
"rewards/cosine_scaled_reward": -0.08863592706620693, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 2590.2084350585938, |
|
"epoch": 0.008, |
|
"grad_norm": 0.21514829993247986, |
|
"kl": 2.5466084480285645e-05, |
|
"learning_rate": 1.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.1516738818027079, |
|
"reward_std": 0.24778864905238152, |
|
"rewards/cosine_scaled_reward": 0.03323611244559288, |
|
"rewards/format_reward": 0.520833358168602, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 2860.7708740234375, |
|
"epoch": 0.009142857142857144, |
|
"grad_norm": 0.18623042106628418, |
|
"kl": 3.412365913391113e-05, |
|
"learning_rate": 1.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.005636372021399438, |
|
"reward_std": 0.18859010189771652, |
|
"rewards/cosine_scaled_reward": -0.21989555237814784, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 3180.3541870117188, |
|
"epoch": 0.010285714285714285, |
|
"grad_norm": 0.18793436884880066, |
|
"kl": 3.8176774978637695e-05, |
|
"learning_rate": 1.8e-07, |
|
"loss": 0.0, |
|
"reward": -0.005034097470343113, |
|
"reward_std": 0.17726320587098598, |
|
"rewards/cosine_scaled_reward": -0.13775646989233792, |
|
"rewards/format_reward": 0.2500000149011612, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 2277.3750915527344, |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 0.24739593267440796, |
|
"kl": 3.692507743835449e-05, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0, |
|
"reward": 0.0970942941494286, |
|
"reward_std": 0.1597892940044403, |
|
"rewards/cosine_scaled_reward": -0.11606822581961751, |
|
"rewards/format_reward": 0.604166679084301, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 2501.1250610351562, |
|
"epoch": 0.012571428571428572, |
|
"grad_norm": 0.29089272022247314, |
|
"kl": 3.090500831604004e-05, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": 0.0, |
|
"reward": 0.14959498681128025, |
|
"reward_std": 0.26211751252412796, |
|
"rewards/cosine_scaled_reward": -0.0010929219424724579, |
|
"rewards/format_reward": 0.5833333656191826, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 2632.9375, |
|
"epoch": 0.013714285714285714, |
|
"grad_norm": 0.23229414224624634, |
|
"kl": 3.573298454284668e-05, |
|
"learning_rate": 2.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.1019544918090105, |
|
"reward_std": 0.23751188814640045, |
|
"rewards/cosine_scaled_reward": -0.04352016560733318, |
|
"rewards/format_reward": 0.4791666716337204, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 2213.3125610351562, |
|
"epoch": 0.014857142857142857, |
|
"grad_norm": 0.1961122304201126, |
|
"kl": 2.2172927856445312e-05, |
|
"learning_rate": 2.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.17769552022218704, |
|
"reward_std": 0.1783296838402748, |
|
"rewards/cosine_scaled_reward": 0.042532917112112045, |
|
"rewards/format_reward": 0.6041666716337204, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 2936.5625610351562, |
|
"epoch": 0.016, |
|
"grad_norm": 0.19352266192436218, |
|
"kl": 3.9771199226379395e-05, |
|
"learning_rate": 2.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.0035927146673202515, |
|
"reward_std": 0.15151787921786308, |
|
"rewards/cosine_scaled_reward": -0.17052022088319063, |
|
"rewards/format_reward": 0.35416667722165585, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 3146.5416870117188, |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 0.195539191365242, |
|
"kl": 3.9458274841308594e-05, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.045046235201880336, |
|
"reward_std": 0.2458735667169094, |
|
"rewards/cosine_scaled_reward": -0.03827371634542942, |
|
"rewards/format_reward": 0.2500000074505806, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 2227.1875610351562, |
|
"epoch": 0.018285714285714287, |
|
"grad_norm": 0.2652665972709656, |
|
"kl": 1.9550323486328125e-05, |
|
"learning_rate": 3.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.21294382214546204, |
|
"reward_std": 0.22787468880414963, |
|
"rewards/cosine_scaled_reward": 0.13755386415868998, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 3370.7708740234375, |
|
"epoch": 0.019428571428571427, |
|
"grad_norm": 0.21129053831100464, |
|
"kl": 4.503130912780762e-05, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": 0.0, |
|
"reward": -0.024287598207592964, |
|
"reward_std": 0.14709143340587616, |
|
"rewards/cosine_scaled_reward": -0.15107670798897743, |
|
"rewards/format_reward": 0.2083333432674408, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 2967.5625610351562, |
|
"epoch": 0.02057142857142857, |
|
"grad_norm": 0.2422505021095276, |
|
"kl": 4.3272972106933594e-05, |
|
"learning_rate": 3.6e-07, |
|
"loss": 0.0, |
|
"reward": -0.025027030613273382, |
|
"reward_std": 0.17510299384593964, |
|
"rewards/cosine_scaled_reward": -0.17359089059755206, |
|
"rewards/format_reward": 0.25000000558793545, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 3309.6041870117188, |
|
"epoch": 0.021714285714285714, |
|
"grad_norm": 0.16750593483448029, |
|
"kl": 3.74913215637207e-05, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": 0.0, |
|
"reward": 0.054348187521100044, |
|
"reward_std": 0.26255496218800545, |
|
"rewards/cosine_scaled_reward": -0.041149500757455826, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 2639.2083740234375, |
|
"epoch": 0.022857142857142857, |
|
"grad_norm": 0.2358766347169876, |
|
"kl": 3.248453140258789e-05, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0, |
|
"reward": 0.09739121049642563, |
|
"reward_std": 0.15525865368545055, |
|
"rewards/cosine_scaled_reward": -0.07560409791767597, |
|
"rewards/format_reward": 0.520833358168602, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 2620.1875610351562, |
|
"epoch": 0.024, |
|
"grad_norm": 0.23142506182193756, |
|
"kl": 3.2573938369750977e-05, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0, |
|
"reward": 0.06687073037028313, |
|
"reward_std": 0.10379723459482193, |
|
"rewards/cosine_scaled_reward": -0.06918285926803946, |
|
"rewards/format_reward": 0.39583333395421505, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 3418.6250610351562, |
|
"epoch": 0.025142857142857144, |
|
"grad_norm": 0.23349517583847046, |
|
"kl": 3.2335519790649414e-05, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 0.0, |
|
"reward": -0.09891281835734844, |
|
"reward_std": 0.16107076033949852, |
|
"rewards/cosine_scaled_reward": -0.24483727663755417, |
|
"rewards/format_reward": 0.10416666977107525, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 3000.4168090820312, |
|
"epoch": 0.026285714285714287, |
|
"grad_norm": 0.22397248446941376, |
|
"kl": 4.8041343688964844e-05, |
|
"learning_rate": 4.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.06495004217140377, |
|
"reward_std": 0.19814502075314522, |
|
"rewards/cosine_scaled_reward": -0.09647991880774498, |
|
"rewards/format_reward": 0.43750001303851604, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 2132.4583740234375, |
|
"epoch": 0.027428571428571427, |
|
"grad_norm": 0.2273954451084137, |
|
"kl": 1.659989356994629e-05, |
|
"learning_rate": 4.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.19651076383888721, |
|
"reward_std": 0.20518572628498077, |
|
"rewards/cosine_scaled_reward": 0.04420430213212967, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 2864.8125610351562, |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.2530263364315033, |
|
"kl": 2.9861927032470703e-05, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.09658885933458805, |
|
"reward_std": 0.15381062403321266, |
|
"rewards/cosine_scaled_reward": -0.010462287813425064, |
|
"rewards/format_reward": 0.3958333395421505, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 3151.791748046875, |
|
"epoch": 0.029714285714285714, |
|
"grad_norm": 0.15439528226852417, |
|
"kl": 2.0995736122131348e-05, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.020544751780107617, |
|
"reward_std": 0.16202639788389206, |
|
"rewards/cosine_scaled_reward": -0.07661792263388634, |
|
"rewards/format_reward": 0.229166679084301, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 2976.416748046875, |
|
"epoch": 0.030857142857142857, |
|
"grad_norm": 0.20570415258407593, |
|
"kl": 2.8975307941436768e-05, |
|
"learning_rate": 5.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.01607461948879063, |
|
"reward_std": 0.12287303619086742, |
|
"rewards/cosine_scaled_reward": -0.10305411368608475, |
|
"rewards/format_reward": 0.27083333395421505, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 3119.3125610351562, |
|
"epoch": 0.032, |
|
"grad_norm": 0.21759092807769775, |
|
"kl": 3.6269426345825195e-05, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.045295797288417816, |
|
"reward_std": 0.12472150847315788, |
|
"rewards/cosine_scaled_reward": -0.06871754361782223, |
|
"rewards/format_reward": 0.31250000558793545, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 3055.291748046875, |
|
"epoch": 0.03314285714285714, |
|
"grad_norm": 0.17530092597007751, |
|
"kl": 1.576542854309082e-05, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.03692814148962498, |
|
"reward_std": 0.2673305310308933, |
|
"rewards/cosine_scaled_reward": -0.08717780001461506, |
|
"rewards/format_reward": 0.31250001676380634, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 3088.9375610351562, |
|
"epoch": 0.03428571428571429, |
|
"grad_norm": 0.23589830100536346, |
|
"kl": 2.016127109527588e-05, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0, |
|
"reward": 0.039040276780724525, |
|
"reward_std": 0.18401793204247952, |
|
"rewards/cosine_scaled_reward": -0.11255598999559879, |
|
"rewards/format_reward": 0.37500001303851604, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2690.041748046875, |
|
"epoch": 0.03542857142857143, |
|
"grad_norm": 0.25512003898620605, |
|
"kl": 2.226606011390686e-05, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.02995466347783804, |
|
"reward_std": 0.1407727226614952, |
|
"rewards/cosine_scaled_reward": -0.1338973045349121, |
|
"rewards/format_reward": 0.37500001303851604, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 3518.2916870117188, |
|
"epoch": 0.036571428571428574, |
|
"grad_norm": 0.15214499831199646, |
|
"kl": 1.8969178199768066e-05, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0, |
|
"reward": -0.052682604640722275, |
|
"reward_std": 0.19048137590289116, |
|
"rewards/cosine_scaled_reward": -0.1558831539005041, |
|
"rewards/format_reward": 0.10416666977107525, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 2984.3541870117188, |
|
"epoch": 0.037714285714285714, |
|
"grad_norm": 0.19921286404132843, |
|
"kl": 2.2009015083312988e-05, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.06904905498959124, |
|
"reward_std": 0.18269503861665726, |
|
"rewards/cosine_scaled_reward": -0.08260507695376873, |
|
"rewards/format_reward": 0.43750002048909664, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 3051.0208435058594, |
|
"epoch": 0.038857142857142854, |
|
"grad_norm": 0.19640006124973297, |
|
"kl": 2.3230910301208496e-05, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.0, |
|
"reward": -0.025606604642234743, |
|
"reward_std": 0.23111771792173386, |
|
"rewards/cosine_scaled_reward": -0.17444449942559004, |
|
"rewards/format_reward": 0.25000000558793545, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 2780.9376220703125, |
|
"epoch": 0.04, |
|
"grad_norm": 0.18088482320308685, |
|
"kl": 2.244114875793457e-05, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0, |
|
"reward": 0.10355074889957905, |
|
"reward_std": 0.20479629933834076, |
|
"rewards/cosine_scaled_reward": -0.0417685154825449, |
|
"rewards/format_reward": 0.47916668467223644, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 2462.7083740234375, |
|
"epoch": 0.04114285714285714, |
|
"grad_norm": 0.18930543959140778, |
|
"kl": 5.8710575103759766e-05, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.1567097045481205, |
|
"reward_std": 0.12252922169864178, |
|
"rewards/cosine_scaled_reward": 0.0724838562309742, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 2707.2500915527344, |
|
"epoch": 0.04228571428571429, |
|
"grad_norm": 0.20084795355796814, |
|
"kl": 4.0590763092041016e-05, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.13954784779343754, |
|
"reward_std": 0.1899961344897747, |
|
"rewards/cosine_scaled_reward": 0.006025645881891251, |
|
"rewards/format_reward": 0.5208333488553762, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 2946.6250610351562, |
|
"epoch": 0.04342857142857143, |
|
"grad_norm": 0.2238946110010147, |
|
"kl": 5.5283308029174805e-05, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0, |
|
"reward": -0.01690117083489895, |
|
"reward_std": 0.18682749196887016, |
|
"rewards/cosine_scaled_reward": -0.20232924073934555, |
|
"rewards/format_reward": 0.3333333469927311, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 2901.9584350585938, |
|
"epoch": 0.044571428571428574, |
|
"grad_norm": 0.22113607823848724, |
|
"kl": 5.8710575103759766e-05, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.09138605836778879, |
|
"reward_std": 0.25620007514953613, |
|
"rewards/cosine_scaled_reward": -0.05268890131264925, |
|
"rewards/format_reward": 0.4583333544433117, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 3042.3334350585938, |
|
"epoch": 0.045714285714285714, |
|
"grad_norm": 0.17167918384075165, |
|
"kl": 3.018975257873535e-05, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0, |
|
"reward": 0.025605826638638973, |
|
"reward_std": 0.17767371982336044, |
|
"rewards/cosine_scaled_reward": -0.11477911379188299, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 3265.6875610351562, |
|
"epoch": 0.046857142857142854, |
|
"grad_norm": 0.15757694840431213, |
|
"kl": 1.1652708053588867e-05, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.032783683855086565, |
|
"reward_std": 0.13638111762702465, |
|
"rewards/cosine_scaled_reward": -0.07333838939666748, |
|
"rewards/format_reward": 0.2708333358168602, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 2019.4584350585938, |
|
"epoch": 0.048, |
|
"grad_norm": 0.2902780771255493, |
|
"kl": 0.00019973516464233398, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.2274437490850687, |
|
"reward_std": 0.20474949106574059, |
|
"rewards/cosine_scaled_reward": 0.08997760340571404, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 3191.1250610351562, |
|
"epoch": 0.04914285714285714, |
|
"grad_norm": 0.199421688914299, |
|
"kl": 3.5726698115468025e-05, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.00293779862113297, |
|
"reward_std": 0.2170444093644619, |
|
"rewards/cosine_scaled_reward": -0.1411289218813181, |
|
"rewards/format_reward": 0.291666679084301, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 2911.6875610351562, |
|
"epoch": 0.05028571428571429, |
|
"grad_norm": 0.20561084151268005, |
|
"kl": 0.0003883242607116699, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.06697382591664791, |
|
"reward_std": 0.14113801531493664, |
|
"rewards/cosine_scaled_reward": -0.07008513808250427, |
|
"rewards/format_reward": 0.3958333432674408, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 2605.666717529297, |
|
"epoch": 0.05142857142857143, |
|
"grad_norm": 0.18309232592582703, |
|
"kl": 4.357099533081055e-05, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0, |
|
"reward": 0.2496887871529907, |
|
"reward_std": 0.23000844940543175, |
|
"rewards/cosine_scaled_reward": 0.19401046447455883, |
|
"rewards/format_reward": 0.5833333507180214, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 2926.604217529297, |
|
"epoch": 0.052571428571428575, |
|
"grad_norm": 0.1925714910030365, |
|
"kl": 0.00014609098434448242, |
|
"learning_rate": 9.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.028744973242282867, |
|
"reward_std": 0.1495701353996992, |
|
"rewards/cosine_scaled_reward": -0.10348123731091619, |
|
"rewards/format_reward": 0.3125, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 2612.3125915527344, |
|
"epoch": 0.053714285714285714, |
|
"grad_norm": 0.19961993396282196, |
|
"kl": 9.500980377197266e-05, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.08338814397575334, |
|
"reward_std": 0.19680871814489365, |
|
"rewards/cosine_scaled_reward": -0.08348039817065, |
|
"rewards/format_reward": 0.4791666716337204, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 2722.3959350585938, |
|
"epoch": 0.054857142857142854, |
|
"grad_norm": 0.21567687392234802, |
|
"kl": 9.578466415405273e-05, |
|
"learning_rate": 9.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.09677822515368462, |
|
"reward_std": 0.2861610949039459, |
|
"rewards/cosine_scaled_reward": -0.04573226906359196, |
|
"rewards/format_reward": 0.45833334885537624, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 2226.354217529297, |
|
"epoch": 0.056, |
|
"grad_norm": 0.38961726427078247, |
|
"kl": 0.000255584716796875, |
|
"learning_rate": 9.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.15269484370946884, |
|
"reward_std": 0.18540722876787186, |
|
"rewards/cosine_scaled_reward": -0.019495231565088034, |
|
"rewards/format_reward": 0.625, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 2191.729217529297, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.25581982731819153, |
|
"kl": 0.000471651554107666, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0, |
|
"reward": 0.1666366644203663, |
|
"reward_std": 0.16899916529655457, |
|
"rewards/cosine_scaled_reward": 0.057675519958138466, |
|
"rewards/format_reward": 0.520833358168602, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 3260.0416870117188, |
|
"epoch": 0.05828571428571429, |
|
"grad_norm": 0.2235163003206253, |
|
"kl": 0.00012612342834472656, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": 0.0, |
|
"reward": -0.0477819973602891, |
|
"reward_std": 0.16141689382493496, |
|
"rewards/cosine_scaled_reward": -0.20123709551990032, |
|
"rewards/format_reward": 0.2083333395421505, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 3330.979248046875, |
|
"epoch": 0.05942857142857143, |
|
"grad_norm": 0.176573246717453, |
|
"kl": 0.00023108720779418945, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": 0.0, |
|
"reward": -0.0365308066830039, |
|
"reward_std": 0.14085101708769798, |
|
"rewards/cosine_scaled_reward": -0.1649935580790043, |
|
"rewards/format_reward": 0.18750000558793545, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 3004.7083740234375, |
|
"epoch": 0.060571428571428575, |
|
"grad_norm": 0.20955270528793335, |
|
"kl": 0.00027896463871002197, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": 0.0, |
|
"reward": -0.003909507766366005, |
|
"reward_std": 0.11847533471882343, |
|
"rewards/cosine_scaled_reward": -0.14490897953510284, |
|
"rewards/format_reward": 0.2708333395421505, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 2624.6875610351562, |
|
"epoch": 0.061714285714285715, |
|
"grad_norm": 0.21326404809951782, |
|
"kl": 0.00024962425231933594, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": 0.0, |
|
"reward": -0.006349522154778242, |
|
"reward_std": 0.10255092941224575, |
|
"rewards/cosine_scaled_reward": -0.21963607892394066, |
|
"rewards/format_reward": 0.4166666865348816, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 3092.479248046875, |
|
"epoch": 0.06285714285714286, |
|
"grad_norm": 0.16402700543403625, |
|
"kl": 0.00016164779663085938, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0, |
|
"reward": -0.02835557982325554, |
|
"reward_std": 0.16623086854815483, |
|
"rewards/cosine_scaled_reward": -0.1794952228665352, |
|
"rewards/format_reward": 0.25000000558793545, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 3083.1458740234375, |
|
"epoch": 0.064, |
|
"grad_norm": 0.22414982318878174, |
|
"kl": 0.0007500648498535156, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.0, |
|
"reward": 0.025544505566358566, |
|
"reward_std": 0.20646458864212036, |
|
"rewards/cosine_scaled_reward": -0.08164806384593248, |
|
"rewards/format_reward": 0.27083333395421505, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 3015.5000610351562, |
|
"epoch": 0.06514285714285714, |
|
"grad_norm": 0.26501360535621643, |
|
"kl": 0.0005974769592285156, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": 0.0, |
|
"reward": 0.0491860609035939, |
|
"reward_std": 0.1841455027461052, |
|
"rewards/cosine_scaled_reward": -0.09901990741491318, |
|
"rewards/format_reward": 0.39583333395421505, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 3105.1666870117188, |
|
"epoch": 0.06628571428571428, |
|
"grad_norm": 0.1655452996492386, |
|
"kl": 0.0008473992347717285, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.0, |
|
"reward": -0.05875010509043932, |
|
"reward_std": 0.08936690539121628, |
|
"rewards/cosine_scaled_reward": -0.23779355734586716, |
|
"rewards/format_reward": 0.25, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 2726.291748046875, |
|
"epoch": 0.06742857142857143, |
|
"grad_norm": 0.2481091469526291, |
|
"kl": 0.0003948211669921875, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": 0.0, |
|
"reward": 0.1814795120153576, |
|
"reward_std": 0.23579821549355984, |
|
"rewards/cosine_scaled_reward": 0.11390005052089691, |
|
"rewards/format_reward": 0.4791666679084301, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 2841.291748046875, |
|
"epoch": 0.06857142857142857, |
|
"grad_norm": 0.196300208568573, |
|
"kl": 0.0006402730941772461, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.0, |
|
"reward": 0.09838544577360153, |
|
"reward_std": 0.1272505521774292, |
|
"rewards/cosine_scaled_reward": 0.030779220163822174, |
|
"rewards/format_reward": 0.31250000186264515, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 2196.8333740234375, |
|
"epoch": 0.06971428571428571, |
|
"grad_norm": 0.1789156198501587, |
|
"kl": 0.0003542900085449219, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": 0.0, |
|
"reward": 0.2810430023819208, |
|
"reward_std": 0.11581777688115835, |
|
"rewards/cosine_scaled_reward": 0.23955225199460983, |
|
"rewards/format_reward": 0.6041666716337204, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 3327.354248046875, |
|
"epoch": 0.07085714285714285, |
|
"grad_norm": 0.15294106304645538, |
|
"kl": 0.00029337406158447266, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": 0.0, |
|
"reward": 0.07980241999030113, |
|
"reward_std": 0.2444281317293644, |
|
"rewards/cosine_scaled_reward": -0.01411302387714386, |
|
"rewards/format_reward": 0.33333334140479565, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 2406.1875610351562, |
|
"epoch": 0.072, |
|
"grad_norm": 0.18347583711147308, |
|
"kl": 0.0021266937255859375, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2145740818232298, |
|
"reward_std": 0.17063674330711365, |
|
"rewards/cosine_scaled_reward": 0.11840518936514854, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 2985.166748046875, |
|
"epoch": 0.07314285714285715, |
|
"grad_norm": 0.16115514934062958, |
|
"kl": 0.00035321712493896484, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": 0.0, |
|
"reward": 0.11250189319252968, |
|
"reward_std": 0.17308304831385612, |
|
"rewards/cosine_scaled_reward": 0.03135997918434441, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 2662.2083740234375, |
|
"epoch": 0.07428571428571429, |
|
"grad_norm": 0.19161196053028107, |
|
"kl": 0.000934600830078125, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.0, |
|
"reward": 0.18816682742908597, |
|
"reward_std": 0.19341924414038658, |
|
"rewards/cosine_scaled_reward": 0.11808005906641483, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 3121.8125610351562, |
|
"epoch": 0.07542857142857143, |
|
"grad_norm": 0.15908025205135345, |
|
"kl": 0.0005314350128173828, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": 0.0, |
|
"reward": 0.07345604291185737, |
|
"reward_std": 0.16167625226080418, |
|
"rewards/cosine_scaled_reward": -0.04804755933582783, |
|
"rewards/format_reward": 0.37500000186264515, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 2960.312530517578, |
|
"epoch": 0.07657142857142857, |
|
"grad_norm": 0.19106991589069366, |
|
"kl": 0.0009171962738037109, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.0, |
|
"reward": 0.11973061971366405, |
|
"reward_std": 0.23938697017729282, |
|
"rewards/cosine_scaled_reward": 0.04380590561777353, |
|
"rewards/format_reward": 0.37500001676380634, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 2512.5625610351562, |
|
"epoch": 0.07771428571428571, |
|
"grad_norm": 0.27955880761146545, |
|
"kl": 0.0024480819702148438, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": 0.0001, |
|
"reward": 0.032972510904073715, |
|
"reward_std": 0.16721198707818985, |
|
"rewards/cosine_scaled_reward": -0.14768926287069917, |
|
"rewards/format_reward": 0.41666667722165585, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 2698.6250915527344, |
|
"epoch": 0.07885714285714286, |
|
"grad_norm": 0.1807398796081543, |
|
"kl": 0.0006427764892578125, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": 0.0, |
|
"reward": 0.14363489238894545, |
|
"reward_std": 0.17969760112464428, |
|
"rewards/cosine_scaled_reward": 0.050120849162340164, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 3165.1458740234375, |
|
"epoch": 0.08, |
|
"grad_norm": 0.17745569348335266, |
|
"kl": 0.0035309791564941406, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0001, |
|
"reward": 0.015050832647830248, |
|
"reward_std": 0.14714835956692696, |
|
"rewards/cosine_scaled_reward": -0.1469174176454544, |
|
"rewards/format_reward": 0.3541666716337204, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 2699.7709045410156, |
|
"epoch": 0.08114285714285714, |
|
"grad_norm": 0.22869746387004852, |
|
"kl": 0.00305938720703125, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": 0.0001, |
|
"reward": 0.04336157673969865, |
|
"reward_std": 0.17576204612851143, |
|
"rewards/cosine_scaled_reward": -0.11572509631514549, |
|
"rewards/format_reward": 0.3958333469927311, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 2606.291748046875, |
|
"epoch": 0.08228571428571428, |
|
"grad_norm": 0.2361251413822174, |
|
"kl": 0.0014438629150390625, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2736402824521065, |
|
"reward_std": 0.2642326597124338, |
|
"rewards/cosine_scaled_reward": 0.2026251358911395, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 2170.729248046875, |
|
"epoch": 0.08342857142857144, |
|
"grad_norm": 0.25033605098724365, |
|
"kl": 0.0022487640380859375, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": 0.0001, |
|
"reward": 0.265652135014534, |
|
"reward_std": 0.19880715385079384, |
|
"rewards/cosine_scaled_reward": 0.17191709205508232, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 2448.6251220703125, |
|
"epoch": 0.08457142857142858, |
|
"grad_norm": 0.2048778533935547, |
|
"kl": 0.0019674301147460938, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.0001, |
|
"reward": 0.08278486505150795, |
|
"reward_std": 0.1648626308888197, |
|
"rewards/cosine_scaled_reward": -0.10248487256467342, |
|
"rewards/format_reward": 0.5208333432674408, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 2838.2500610351562, |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.2654828727245331, |
|
"kl": 0.0029697418212890625, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": 0.0001, |
|
"reward": 0.06769379088655114, |
|
"reward_std": 0.24208112806081772, |
|
"rewards/cosine_scaled_reward": -0.04729684395715594, |
|
"rewards/format_reward": 0.3541666679084301, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 2911.5209350585938, |
|
"epoch": 0.08685714285714285, |
|
"grad_norm": 0.17353959381580353, |
|
"kl": 0.0006062984466552734, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.0, |
|
"reward": 0.060059760231524706, |
|
"reward_std": 0.19824261032044888, |
|
"rewards/cosine_scaled_reward": -0.07150101102888584, |
|
"rewards/format_reward": 0.37500000558793545, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 3174.6875610351562, |
|
"epoch": 0.088, |
|
"grad_norm": 0.15684625506401062, |
|
"kl": 0.00110626220703125, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": 0.0, |
|
"reward": 0.03418249450623989, |
|
"reward_std": 0.17063240334391594, |
|
"rewards/cosine_scaled_reward": -0.05116889998316765, |
|
"rewards/format_reward": 0.22916666977107525, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 2902.9375, |
|
"epoch": 0.08914285714285715, |
|
"grad_norm": 0.209213986992836, |
|
"kl": 0.0010857582092285156, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.0, |
|
"reward": 0.10875159315764904, |
|
"reward_std": 0.13933855667710304, |
|
"rewards/cosine_scaled_reward": -0.0003913678228855133, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 2852.0416870117188, |
|
"epoch": 0.09028571428571429, |
|
"grad_norm": 0.19134745001792908, |
|
"kl": 0.0005846023559570312, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": 0.0, |
|
"reward": 0.12408905290067196, |
|
"reward_std": 0.227988138794899, |
|
"rewards/cosine_scaled_reward": 0.002291955053806305, |
|
"rewards/format_reward": 0.479166679084301, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 3436.8125, |
|
"epoch": 0.09142857142857143, |
|
"grad_norm": 0.15251587331295013, |
|
"kl": 0.0006704330444335938, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.0, |
|
"reward": -0.02238150453194976, |
|
"reward_std": 0.1688902247697115, |
|
"rewards/cosine_scaled_reward": -0.12677161488682032, |
|
"rewards/format_reward": 0.1666666716337204, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 3266.9375610351562, |
|
"epoch": 0.09257142857142857, |
|
"grad_norm": 0.17284975945949554, |
|
"kl": 0.0026865005493164062, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0007507335394620895, |
|
"reward_std": 0.1353786587715149, |
|
"rewards/cosine_scaled_reward": -0.09297612681984901, |
|
"rewards/format_reward": 0.18750000186264515, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 3086.416748046875, |
|
"epoch": 0.09371428571428571, |
|
"grad_norm": 0.18978342413902283, |
|
"kl": 0.0010721683502197266, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": 0.0, |
|
"reward": 0.06928094290196896, |
|
"reward_std": 0.18230855278670788, |
|
"rewards/cosine_scaled_reward": -0.0865684850141406, |
|
"rewards/format_reward": 0.4375000111758709, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 3473.8958740234375, |
|
"epoch": 0.09485714285714286, |
|
"grad_norm": 0.17331495881080627, |
|
"kl": 0.0004067420959472656, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": 0.0, |
|
"reward": -0.11179337278008461, |
|
"reward_std": 0.1241249330341816, |
|
"rewards/cosine_scaled_reward": -0.2696155607700348, |
|
"rewards/format_reward": 0.1041666679084301, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 2710.541748046875, |
|
"epoch": 0.096, |
|
"grad_norm": 0.18735957145690918, |
|
"kl": 0.00074005126953125, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": 0.0, |
|
"reward": 0.2588787730783224, |
|
"reward_std": 0.20492861978709698, |
|
"rewards/cosine_scaled_reward": 0.19406858971342444, |
|
"rewards/format_reward": 0.604166679084301, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 3282.8333740234375, |
|
"epoch": 0.09714285714285714, |
|
"grad_norm": 0.1622576266527176, |
|
"kl": 0.00115203857421875, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0, |
|
"reward": 0.07870295969769359, |
|
"reward_std": 0.12834673561155796, |
|
"rewards/cosine_scaled_reward": -0.0030822306871414185, |
|
"rewards/format_reward": 0.3125000074505806, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 2859.2083740234375, |
|
"epoch": 0.09828571428571428, |
|
"grad_norm": 0.20539958775043488, |
|
"kl": 0.0013775825500488281, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": 0.0001, |
|
"reward": 0.020047522732056677, |
|
"reward_std": 0.17210980132222176, |
|
"rewards/cosine_scaled_reward": -0.14074895903468132, |
|
"rewards/format_reward": 0.3541666679084301, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 3006.3125610351562, |
|
"epoch": 0.09942857142857142, |
|
"grad_norm": 0.19360637664794922, |
|
"kl": 0.0025768280029296875, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": 0.0001, |
|
"reward": -0.005380205810070038, |
|
"reward_std": 0.16785390488803387, |
|
"rewards/cosine_scaled_reward": -0.1755614336580038, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 2985.729248046875, |
|
"epoch": 0.10057142857142858, |
|
"grad_norm": 0.17934949696063995, |
|
"kl": 0.0023851394653320312, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": 0.0001, |
|
"reward": 0.13574134244117886, |
|
"reward_std": 0.21831882745027542, |
|
"rewards/cosine_scaled_reward": 0.0448464211076498, |
|
"rewards/format_reward": 0.4375000074505806, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 3279.5208740234375, |
|
"epoch": 0.10171428571428572, |
|
"grad_norm": 0.17610202729701996, |
|
"kl": 0.005316257476806641, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": 0.0002, |
|
"reward": 0.08696338161826134, |
|
"reward_std": 0.19547893106937408, |
|
"rewards/cosine_scaled_reward": 0.030678212642669678, |
|
"rewards/format_reward": 0.27083334140479565, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 3082.6875, |
|
"epoch": 0.10285714285714286, |
|
"grad_norm": 0.1900225430727005, |
|
"kl": 0.001110076904296875, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0, |
|
"reward": 0.04298854619264603, |
|
"reward_std": 0.20502052083611488, |
|
"rewards/cosine_scaled_reward": -0.06164951249957085, |
|
"rewards/format_reward": 0.2916666828095913, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 3002.5833740234375, |
|
"epoch": 0.104, |
|
"grad_norm": 0.18745382130146027, |
|
"kl": 0.0011818408966064453, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.0, |
|
"reward": 0.0811410085298121, |
|
"reward_std": 0.1805788390338421, |
|
"rewards/cosine_scaled_reward": -0.05142554081976414, |
|
"rewards/format_reward": 0.41666667349636555, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 2689.104248046875, |
|
"epoch": 0.10514285714285715, |
|
"grad_norm": 0.19328680634498596, |
|
"kl": 0.001689910888671875, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": 0.0001, |
|
"reward": 0.10821354016661644, |
|
"reward_std": 0.1421743929386139, |
|
"rewards/cosine_scaled_reward": -0.05042751878499985, |
|
"rewards/format_reward": 0.520833358168602, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 2476.979217529297, |
|
"epoch": 0.10628571428571429, |
|
"grad_norm": 0.21367445588111877, |
|
"kl": 0.0015621185302734375, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": 0.0001, |
|
"reward": 0.14456679113209248, |
|
"reward_std": 0.1662643477320671, |
|
"rewards/cosine_scaled_reward": 0.03526845946907997, |
|
"rewards/format_reward": 0.47916667722165585, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 3352.0208740234375, |
|
"epoch": 0.10742857142857143, |
|
"grad_norm": 0.20487773418426514, |
|
"kl": 0.00179290771484375, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.0001, |
|
"reward": 0.04233929002657533, |
|
"reward_std": 0.1873803436756134, |
|
"rewards/cosine_scaled_reward": -0.0499027743935585, |
|
"rewards/format_reward": 0.2708333469927311, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 3058.8125, |
|
"epoch": 0.10857142857142857, |
|
"grad_norm": 0.2256304770708084, |
|
"kl": 0.0010938644409179688, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0, |
|
"reward": -0.011163771152496338, |
|
"reward_std": 0.1628007385879755, |
|
"rewards/cosine_scaled_reward": -0.16808456648141146, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 2390.4166870117188, |
|
"epoch": 0.10971428571428571, |
|
"grad_norm": 0.20839492976665497, |
|
"kl": 0.001537322998046875, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0001, |
|
"reward": 0.08394679566845298, |
|
"reward_std": 0.1445230431854725, |
|
"rewards/cosine_scaled_reward": -0.0893700122833252, |
|
"rewards/format_reward": 0.5000000149011612, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 2910.2709350585938, |
|
"epoch": 0.11085714285714286, |
|
"grad_norm": 0.20374347269535065, |
|
"kl": 0.001979351043701172, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": 0.0001, |
|
"reward": 0.09348384477198124, |
|
"reward_std": 0.1994321532547474, |
|
"rewards/cosine_scaled_reward": 0.0048431046307086945, |
|
"rewards/format_reward": 0.33333333395421505, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 2694.2083740234375, |
|
"epoch": 0.112, |
|
"grad_norm": 0.26260530948638916, |
|
"kl": 0.008546829223632812, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.0003, |
|
"reward": 0.06050444394350052, |
|
"reward_std": 0.16644578985869884, |
|
"rewards/cosine_scaled_reward": -0.08548790030181408, |
|
"rewards/format_reward": 0.3958333432674408, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 2938.6875, |
|
"epoch": 0.11314285714285714, |
|
"grad_norm": 0.22185884416103363, |
|
"kl": 0.001251220703125, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": 0.0001, |
|
"reward": 0.12049873173236847, |
|
"reward_std": 0.12151942402124405, |
|
"rewards/cosine_scaled_reward": 0.03361584059894085, |
|
"rewards/format_reward": 0.3958333544433117, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 3105.5834350585938, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.18423013389110565, |
|
"kl": 0.0013895034790039062, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0001, |
|
"reward": 0.025272036204114556, |
|
"reward_std": 0.20255185291171074, |
|
"rewards/cosine_scaled_reward": -0.10323571693152189, |
|
"rewards/format_reward": 0.31250000558793545, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 2749.250030517578, |
|
"epoch": 0.11542857142857142, |
|
"grad_norm": 0.18827371299266815, |
|
"kl": 0.00228118896484375, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.0001, |
|
"reward": 0.03973736334592104, |
|
"reward_std": 0.17379421554505825, |
|
"rewards/cosine_scaled_reward": -0.15294075850397348, |
|
"rewards/format_reward": 0.4583333544433117, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 2767.0834350585938, |
|
"epoch": 0.11657142857142858, |
|
"grad_norm": 0.16772513091564178, |
|
"kl": 0.002315521240234375, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0401492640376091, |
|
"reward_std": 0.14047331921756268, |
|
"rewards/cosine_scaled_reward": -0.11048189923167229, |
|
"rewards/format_reward": 0.375, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 3021.2708740234375, |
|
"epoch": 0.11771428571428572, |
|
"grad_norm": 0.1802755743265152, |
|
"kl": 0.001617431640625, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0870060611050576, |
|
"reward_std": 0.17526693642139435, |
|
"rewards/cosine_scaled_reward": -0.03904236480593681, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 2614.1458740234375, |
|
"epoch": 0.11885714285714286, |
|
"grad_norm": 0.1638043224811554, |
|
"kl": 0.0009593963623046875, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": 0.0, |
|
"reward": 0.23584382608532906, |
|
"reward_std": 0.19611848145723343, |
|
"rewards/cosine_scaled_reward": 0.20568424928933382, |
|
"rewards/format_reward": 0.5, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 3105.791717529297, |
|
"epoch": 0.12, |
|
"grad_norm": 0.18571417033672333, |
|
"kl": 0.00255584716796875, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": 0.0001, |
|
"reward": -0.0297448318451643, |
|
"reward_std": 0.1015834640711546, |
|
"rewards/cosine_scaled_reward": -0.1741093248128891, |
|
"rewards/format_reward": 0.2291666679084301, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 2815.8126220703125, |
|
"epoch": 0.12114285714285715, |
|
"grad_norm": 0.21194417774677277, |
|
"kl": 0.0025634765625, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0001, |
|
"reward": 0.11232324969023466, |
|
"reward_std": 0.21721220761537552, |
|
"rewards/cosine_scaled_reward": 0.01769975572824478, |
|
"rewards/format_reward": 0.3958333432674408, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 2541.416748046875, |
|
"epoch": 0.12228571428571429, |
|
"grad_norm": 0.22722160816192627, |
|
"kl": 0.002227783203125, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": 0.0001, |
|
"reward": 0.019650347530841827, |
|
"reward_std": 0.21003135293722153, |
|
"rewards/cosine_scaled_reward": -0.17047632485628128, |
|
"rewards/format_reward": 0.41666667722165585, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 2334.2500915527344, |
|
"epoch": 0.12342857142857143, |
|
"grad_norm": 0.24612505733966827, |
|
"kl": 0.0037975311279296875, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.0002, |
|
"reward": 0.24177123652771115, |
|
"reward_std": 0.19179029762744904, |
|
"rewards/cosine_scaled_reward": 0.16906813159585, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 2786.5833740234375, |
|
"epoch": 0.12457142857142857, |
|
"grad_norm": 0.23426760733127594, |
|
"kl": 0.002147674560546875, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": 0.0001, |
|
"reward": 0.019255569204688072, |
|
"reward_std": 0.19491948932409286, |
|
"rewards/cosine_scaled_reward": -0.15080422349274158, |
|
"rewards/format_reward": 0.3750000149011612, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 2611.3959045410156, |
|
"epoch": 0.12571428571428572, |
|
"grad_norm": 0.2736619710922241, |
|
"kl": 0.0035686492919921875, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0001, |
|
"reward": 0.29651589691638947, |
|
"reward_std": 0.18929306417703629, |
|
"rewards/cosine_scaled_reward": 0.23898081667721272, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 2047.7917175292969, |
|
"epoch": 0.12685714285714286, |
|
"grad_norm": 0.22474665939807892, |
|
"kl": 0.00275421142578125, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": 0.0001, |
|
"reward": 0.19512577797286212, |
|
"reward_std": 0.17880065739154816, |
|
"rewards/cosine_scaled_reward": 0.03617064421996474, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 2695.3958740234375, |
|
"epoch": 0.128, |
|
"grad_norm": 0.20437544584274292, |
|
"kl": 0.0023288726806640625, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": 0.0001, |
|
"reward": 0.18288636580109596, |
|
"reward_std": 0.19504074566066265, |
|
"rewards/cosine_scaled_reward": 0.07991745974868536, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 2568.4583435058594, |
|
"epoch": 0.12914285714285714, |
|
"grad_norm": 0.20730017125606537, |
|
"kl": 0.0066070556640625, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": 0.0003, |
|
"reward": 0.13345283642411232, |
|
"reward_std": 0.1859412807971239, |
|
"rewards/cosine_scaled_reward": 0.004893161356449127, |
|
"rewards/format_reward": 0.5, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 1677.6875305175781, |
|
"epoch": 0.13028571428571428, |
|
"grad_norm": 0.2240653932094574, |
|
"kl": 0.0018825531005859375, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": 0.0001, |
|
"reward": 0.27667421475052834, |
|
"reward_std": 0.15631183050572872, |
|
"rewards/cosine_scaled_reward": 0.16248321998864412, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 2820.479248046875, |
|
"epoch": 0.13142857142857142, |
|
"grad_norm": 0.1784657984972, |
|
"kl": 0.0027828216552734375, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1664815954864025, |
|
"reward_std": 0.1997350938618183, |
|
"rewards/cosine_scaled_reward": 0.08072170801460743, |
|
"rewards/format_reward": 0.4791666865348816, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 2746.4583740234375, |
|
"epoch": 0.13257142857142856, |
|
"grad_norm": 0.20344781875610352, |
|
"kl": 0.00470733642578125, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": 0.0002, |
|
"reward": 0.04535680764820427, |
|
"reward_std": 0.10419335402548313, |
|
"rewards/cosine_scaled_reward": -0.08311295229941607, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 3282.0208740234375, |
|
"epoch": 0.1337142857142857, |
|
"grad_norm": 0.1357363760471344, |
|
"kl": 0.002735137939453125, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": 0.0001, |
|
"reward": -0.02954237163066864, |
|
"reward_std": 0.14159450307488441, |
|
"rewards/cosine_scaled_reward": -0.1510828686878085, |
|
"rewards/format_reward": 0.1875, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 2032.229232788086, |
|
"epoch": 0.13485714285714287, |
|
"grad_norm": 0.27717867493629456, |
|
"kl": 0.01053619384765625, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": 0.0004, |
|
"reward": 0.13773571141064167, |
|
"reward_std": 0.1858275569975376, |
|
"rewards/cosine_scaled_reward": -0.0683070570230484, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 2059.5834197998047, |
|
"epoch": 0.136, |
|
"grad_norm": 0.25735679268836975, |
|
"kl": 0.005496978759765625, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": 0.0002, |
|
"reward": 0.10206396621651947, |
|
"reward_std": 0.13856840506196022, |
|
"rewards/cosine_scaled_reward": -0.10751124238595366, |
|
"rewards/format_reward": 0.6041666865348816, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 2900.416748046875, |
|
"epoch": 0.13714285714285715, |
|
"grad_norm": 0.22331129014492035, |
|
"kl": 0.0034637451171875, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0223141775932163, |
|
"reward_std": 0.2024293877184391, |
|
"rewards/cosine_scaled_reward": -0.18740470334887505, |
|
"rewards/format_reward": 0.4583333507180214, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 2943.8958740234375, |
|
"epoch": 0.1382857142857143, |
|
"grad_norm": 0.17821165919303894, |
|
"kl": 0.002651214599609375, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": 0.0001, |
|
"reward": 0.04273420386016369, |
|
"reward_std": 0.1815544180572033, |
|
"rewards/cosine_scaled_reward": -0.09382643923163414, |
|
"rewards/format_reward": 0.354166679084301, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 2829.3958740234375, |
|
"epoch": 0.13942857142857143, |
|
"grad_norm": 0.21492768824100494, |
|
"kl": 0.005435943603515625, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": 0.0002, |
|
"reward": 0.006649336777627468, |
|
"reward_std": 0.13518287613987923, |
|
"rewards/cosine_scaled_reward": -0.15551936253905296, |
|
"rewards/format_reward": 0.3333333358168602, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 2489.0000915527344, |
|
"epoch": 0.14057142857142857, |
|
"grad_norm": 0.5098468661308289, |
|
"kl": 0.02829742431640625, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.0011, |
|
"reward": 0.12213594932109118, |
|
"reward_std": 0.23784950748085976, |
|
"rewards/cosine_scaled_reward": -0.005025926977396011, |
|
"rewards/format_reward": 0.4791666828095913, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 2508.187530517578, |
|
"epoch": 0.1417142857142857, |
|
"grad_norm": 0.23071174323558807, |
|
"kl": 0.0027313232421875, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": 0.0001, |
|
"reward": 0.15378618612885475, |
|
"reward_std": 0.2259940393269062, |
|
"rewards/cosine_scaled_reward": 0.025300168432295322, |
|
"rewards/format_reward": 0.5416666753590107, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 3111.6250610351562, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.1613474041223526, |
|
"kl": 0.0019855499267578125, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.0001, |
|
"reward": 0.00011996552348136902, |
|
"reward_std": 0.12452885881066322, |
|
"rewards/cosine_scaled_reward": -0.15928708389401436, |
|
"rewards/format_reward": 0.3125, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 2480.2293090820312, |
|
"epoch": 0.144, |
|
"grad_norm": 0.23107463121414185, |
|
"kl": 0.00391387939453125, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": 0.0002, |
|
"reward": 0.10650707967579365, |
|
"reward_std": 0.2277711071074009, |
|
"rewards/cosine_scaled_reward": -0.06300333887338638, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 2718.9376220703125, |
|
"epoch": 0.14514285714285713, |
|
"grad_norm": 0.16975504159927368, |
|
"kl": 0.0023326873779296875, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": 0.0001, |
|
"reward": 0.02971411682665348, |
|
"reward_std": 0.1977171078324318, |
|
"rewards/cosine_scaled_reward": -0.14219553396105766, |
|
"rewards/format_reward": 0.3958333469927311, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 2266.666778564453, |
|
"epoch": 0.1462857142857143, |
|
"grad_norm": 0.18879002332687378, |
|
"kl": 0.0033416748046875, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": 0.0001, |
|
"reward": 0.15120768686756492, |
|
"reward_std": 0.20895353704690933, |
|
"rewards/cosine_scaled_reward": 0.01915425295010209, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 2129.500015258789, |
|
"epoch": 0.14742857142857144, |
|
"grad_norm": 0.26337873935699463, |
|
"kl": 0.0062255859375, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1327032782137394, |
|
"reward_std": 0.19304194673895836, |
|
"rewards/cosine_scaled_reward": -0.03845389559864998, |
|
"rewards/format_reward": 0.5833333488553762, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 2919.5208740234375, |
|
"epoch": 0.14857142857142858, |
|
"grad_norm": 0.1732492297887802, |
|
"kl": 0.00247955322265625, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0001, |
|
"reward": 0.02012626640498638, |
|
"reward_std": 0.16613218560814857, |
|
"rewards/cosine_scaled_reward": -0.1293521734769456, |
|
"rewards/format_reward": 0.3333333507180214, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 2883.4375, |
|
"epoch": 0.14971428571428572, |
|
"grad_norm": 0.16014830768108368, |
|
"kl": 0.0026702880859375, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1132216090336442, |
|
"reward_std": 0.16543111577630043, |
|
"rewards/cosine_scaled_reward": -0.01893126592040062, |
|
"rewards/format_reward": 0.479166679084301, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 3016.6875610351562, |
|
"epoch": 0.15085714285714286, |
|
"grad_norm": 0.18878363072872162, |
|
"kl": 0.0045375823974609375, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": 0.0002, |
|
"reward": 0.0002798512578010559, |
|
"reward_std": 0.1798754744231701, |
|
"rewards/cosine_scaled_reward": -0.13517173379659653, |
|
"rewards/format_reward": 0.27083334885537624, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 2867.4584350585938, |
|
"epoch": 0.152, |
|
"grad_norm": 0.18914277851581573, |
|
"kl": 0.004039764404296875, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": 0.0002, |
|
"reward": 0.05362038780003786, |
|
"reward_std": 0.1794121377170086, |
|
"rewards/cosine_scaled_reward": -0.09410551190376282, |
|
"rewards/format_reward": 0.3958333432674408, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 1913.1042175292969, |
|
"epoch": 0.15314285714285714, |
|
"grad_norm": 0.27556517720222473, |
|
"kl": 0.00406646728515625, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": 0.0002, |
|
"reward": 0.12812363170087337, |
|
"reward_std": 0.16640142910182476, |
|
"rewards/cosine_scaled_reward": -0.07728635333478451, |
|
"rewards/format_reward": 0.645833358168602, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 2429.604217529297, |
|
"epoch": 0.15428571428571428, |
|
"grad_norm": 0.21053838729858398, |
|
"kl": 0.003673553466796875, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0001, |
|
"reward": 0.07179796043783426, |
|
"reward_std": 0.13138782046735287, |
|
"rewards/cosine_scaled_reward": -0.07374508306384087, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 1984.0834045410156, |
|
"epoch": 0.15542857142857142, |
|
"grad_norm": 0.26108235120773315, |
|
"kl": 0.003261566162109375, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": 0.0001, |
|
"reward": 0.18163915304467082, |
|
"reward_std": 0.17089967243373394, |
|
"rewards/cosine_scaled_reward": -0.029379967600107193, |
|
"rewards/format_reward": 0.75, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 2869.9583740234375, |
|
"epoch": 0.15657142857142858, |
|
"grad_norm": 0.20500995218753815, |
|
"kl": 0.0051708221435546875, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": 0.0002, |
|
"reward": -0.008513325825333595, |
|
"reward_std": 0.16568787395954132, |
|
"rewards/cosine_scaled_reward": -0.22801739536225796, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 1847.3125610351562, |
|
"epoch": 0.15771428571428572, |
|
"grad_norm": 0.2537723481655121, |
|
"kl": 0.003082275390625, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.0001, |
|
"reward": 0.2711644656956196, |
|
"reward_std": 0.20930937677621841, |
|
"rewards/cosine_scaled_reward": 0.16767838457599282, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 2193.291748046875, |
|
"epoch": 0.15885714285714286, |
|
"grad_norm": 0.21132107079029083, |
|
"kl": 0.003009796142578125, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1569109088741243, |
|
"reward_std": 0.15476588159799576, |
|
"rewards/cosine_scaled_reward": -0.025194160640239716, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 2260.416717529297, |
|
"epoch": 0.16, |
|
"grad_norm": 0.2160802185535431, |
|
"kl": 0.00319671630859375, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.0001, |
|
"reward": 0.13407661230303347, |
|
"reward_std": 0.16906898841261864, |
|
"rewards/cosine_scaled_reward": -0.06701312679797411, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 2282.6875, |
|
"epoch": 0.16114285714285714, |
|
"grad_norm": 0.24112887680530548, |
|
"kl": 0.0041351318359375, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": 0.0002, |
|
"reward": 0.13663070276379585, |
|
"reward_std": 0.1782714631408453, |
|
"rewards/cosine_scaled_reward": -0.027498777955770493, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 2172.166748046875, |
|
"epoch": 0.16228571428571428, |
|
"grad_norm": 0.2610005736351013, |
|
"kl": 0.003330230712890625, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": 0.0001, |
|
"reward": 0.19508975371718407, |
|
"reward_std": 0.21969266794621944, |
|
"rewards/cosine_scaled_reward": 0.08089338196441531, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 2489.541717529297, |
|
"epoch": 0.16342857142857142, |
|
"grad_norm": 0.18988506495952606, |
|
"kl": 0.0038480758666992188, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": 0.0002, |
|
"reward": 0.16926367208361626, |
|
"reward_std": 0.19583113491535187, |
|
"rewards/cosine_scaled_reward": 0.03587030619382858, |
|
"rewards/format_reward": 0.5833333544433117, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 2520.7084350585938, |
|
"epoch": 0.16457142857142856, |
|
"grad_norm": 0.20590053498744965, |
|
"kl": 0.00385284423828125, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.0002, |
|
"reward": 0.16642944514751434, |
|
"reward_std": 0.20522606186568737, |
|
"rewards/cosine_scaled_reward": 0.040587374940514565, |
|
"rewards/format_reward": 0.5625000260770321, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 2526.166748046875, |
|
"epoch": 0.1657142857142857, |
|
"grad_norm": 0.23322638869285583, |
|
"kl": 0.0052337646484375, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.0002, |
|
"reward": 0.045561966486275196, |
|
"reward_std": 0.2455296441912651, |
|
"rewards/cosine_scaled_reward": -0.14990929747000337, |
|
"rewards/format_reward": 0.4791666865348816, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 2197.1458740234375, |
|
"epoch": 0.16685714285714287, |
|
"grad_norm": 0.22786113619804382, |
|
"kl": 0.00408172607421875, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": 0.0002, |
|
"reward": 0.02551903622224927, |
|
"reward_std": 0.1334901675581932, |
|
"rewards/cosine_scaled_reward": -0.25134460628032684, |
|
"rewards/format_reward": 0.6041666772216558, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 2167.7291717529297, |
|
"epoch": 0.168, |
|
"grad_norm": 0.255658894777298, |
|
"kl": 0.00374603271484375, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": 0.0001, |
|
"reward": 0.22885112185031176, |
|
"reward_std": 0.1546173021197319, |
|
"rewards/cosine_scaled_reward": 0.14800470299087465, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 2297.666748046875, |
|
"epoch": 0.16914285714285715, |
|
"grad_norm": 0.2738894820213318, |
|
"kl": 0.00521087646484375, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": 0.0002, |
|
"reward": 0.03275088965892792, |
|
"reward_std": 0.1455500442534685, |
|
"rewards/cosine_scaled_reward": -0.2082662135362625, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 2671.7709350585938, |
|
"epoch": 0.1702857142857143, |
|
"grad_norm": 0.20778509974479675, |
|
"kl": 0.004669189453125, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": 0.0002, |
|
"reward": 0.17761395254638046, |
|
"reward_std": 0.18856573849916458, |
|
"rewards/cosine_scaled_reward": 0.08179567754268646, |
|
"rewards/format_reward": 0.5208333432674408, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 2050.166717529297, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.22614718973636627, |
|
"kl": 0.005207061767578125, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0002, |
|
"reward": 0.07955310121178627, |
|
"reward_std": 0.1677793301641941, |
|
"rewards/cosine_scaled_reward": -0.17033380083739758, |
|
"rewards/format_reward": 0.6458333395421505, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 2235.6459197998047, |
|
"epoch": 0.17257142857142857, |
|
"grad_norm": 0.19895651936531067, |
|
"kl": 0.0030078887939453125, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0866355006583035, |
|
"reward_std": 0.10392699390649796, |
|
"rewards/cosine_scaled_reward": -0.09595790691673756, |
|
"rewards/format_reward": 0.5208333432674408, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 3005.9583740234375, |
|
"epoch": 0.1737142857142857, |
|
"grad_norm": 0.31027477979660034, |
|
"kl": 0.0070343017578125, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": 0.0003, |
|
"reward": -0.002403062768280506, |
|
"reward_std": 0.1533808410167694, |
|
"rewards/cosine_scaled_reward": -0.13117293268442154, |
|
"rewards/format_reward": 0.25000001303851604, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 2327.3541870117188, |
|
"epoch": 0.17485714285714285, |
|
"grad_norm": 0.20010127127170563, |
|
"kl": 0.0037221908569335938, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": 0.0001, |
|
"reward": 0.23278480861335993, |
|
"reward_std": 0.19316110759973526, |
|
"rewards/cosine_scaled_reward": 0.12676820158958435, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 2711.229248046875, |
|
"epoch": 0.176, |
|
"grad_norm": 0.2796197831630707, |
|
"kl": 0.00414276123046875, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": 0.0002, |
|
"reward": 0.15084031783044338, |
|
"reward_std": 0.14097959361970425, |
|
"rewards/cosine_scaled_reward": 0.0931478925049305, |
|
"rewards/format_reward": 0.39583333395421505, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 2727.6875610351562, |
|
"epoch": 0.17714285714285713, |
|
"grad_norm": 0.2060837596654892, |
|
"kl": 0.0041351318359375, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": 0.0002, |
|
"reward": 0.026034665293991566, |
|
"reward_std": 0.20783771388232708, |
|
"rewards/cosine_scaled_reward": -0.17815358191728592, |
|
"rewards/format_reward": 0.4583333395421505, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 2652.3125610351562, |
|
"epoch": 0.1782857142857143, |
|
"grad_norm": 0.1977294683456421, |
|
"kl": 0.00499725341796875, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": 0.0002, |
|
"reward": -0.00022369646467268467, |
|
"reward_std": 0.10643414594233036, |
|
"rewards/cosine_scaled_reward": -0.2093517892062664, |
|
"rewards/format_reward": 0.4166666828095913, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 2042.7292175292969, |
|
"epoch": 0.17942857142857144, |
|
"grad_norm": 0.25187933444976807, |
|
"kl": 0.0039520263671875, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.0002, |
|
"reward": 0.16695543192327023, |
|
"reward_std": 0.23595153540372849, |
|
"rewards/cosine_scaled_reward": -0.006288483738899231, |
|
"rewards/format_reward": 0.645833358168602, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 2814.729217529297, |
|
"epoch": 0.18057142857142858, |
|
"grad_norm": 0.17815591394901276, |
|
"kl": 0.00458526611328125, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": 0.0002, |
|
"reward": -0.004143957048654556, |
|
"reward_std": 0.11188133526593447, |
|
"rewards/cosine_scaled_reward": -0.22808771207928658, |
|
"rewards/format_reward": 0.43750000558793545, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 1992.2500610351562, |
|
"epoch": 0.18171428571428572, |
|
"grad_norm": 0.21933415532112122, |
|
"kl": 0.004100799560546875, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": 0.0002, |
|
"reward": 0.2522421330213547, |
|
"reward_std": 0.22296695411205292, |
|
"rewards/cosine_scaled_reward": 0.08081851835595444, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 2079.416748046875, |
|
"epoch": 0.18285714285714286, |
|
"grad_norm": 0.19679833948612213, |
|
"kl": 0.003993988037109375, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": 0.0002, |
|
"reward": 0.18703680613543838, |
|
"reward_std": 0.260627418756485, |
|
"rewards/cosine_scaled_reward": 0.00598154217004776, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 3009.541717529297, |
|
"epoch": 0.184, |
|
"grad_norm": 0.24472463130950928, |
|
"kl": 0.005504608154296875, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": 0.0002, |
|
"reward": 0.014636407606303692, |
|
"reward_std": 0.1625995971262455, |
|
"rewards/cosine_scaled_reward": -0.15822336450219154, |
|
"rewards/format_reward": 0.37500000931322575, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 2617.3751220703125, |
|
"epoch": 0.18514285714285714, |
|
"grad_norm": 0.18487517535686493, |
|
"kl": 0.0045013427734375, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.0002, |
|
"reward": 0.11570010334253311, |
|
"reward_std": 0.17325943149626255, |
|
"rewards/cosine_scaled_reward": -0.03621460869908333, |
|
"rewards/format_reward": 0.5208333395421505, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 2124.4584045410156, |
|
"epoch": 0.18628571428571428, |
|
"grad_norm": 0.3504065275192261, |
|
"kl": 0.006938934326171875, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": 0.0003, |
|
"reward": 0.11949230777099729, |
|
"reward_std": 0.21487346105277538, |
|
"rewards/cosine_scaled_reward": -0.061273553408682346, |
|
"rewards/format_reward": 0.5833333507180214, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 2052.791717529297, |
|
"epoch": 0.18742857142857142, |
|
"grad_norm": 0.27968698740005493, |
|
"kl": 0.00577545166015625, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.0002, |
|
"reward": 0.0795913627371192, |
|
"reward_std": 0.21591341868042946, |
|
"rewards/cosine_scaled_reward": -0.12282621720805764, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 2127.2708740234375, |
|
"epoch": 0.18857142857142858, |
|
"grad_norm": 0.21363244950771332, |
|
"kl": 0.004779815673828125, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0002, |
|
"reward": 0.19190740585327148, |
|
"reward_std": 0.221625704318285, |
|
"rewards/cosine_scaled_reward": 0.05086267925798893, |
|
"rewards/format_reward": 0.6458333395421505, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1618.8959350585938, |
|
"epoch": 0.18971428571428572, |
|
"grad_norm": 0.21441620588302612, |
|
"kl": 0.0042877197265625, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.0002, |
|
"reward": 0.14088203478604555, |
|
"reward_std": 0.12707579229027033, |
|
"rewards/cosine_scaled_reward": -0.08422760479152203, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 2476.7708740234375, |
|
"epoch": 0.19085714285714286, |
|
"grad_norm": 0.21957609057426453, |
|
"kl": 0.005298614501953125, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": 0.0002, |
|
"reward": -0.01745962956920266, |
|
"reward_std": 0.12951701134443283, |
|
"rewards/cosine_scaled_reward": -0.2852119877934456, |
|
"rewards/format_reward": 0.5000000111758709, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 1929.354248046875, |
|
"epoch": 0.192, |
|
"grad_norm": 0.23081496357917786, |
|
"kl": 0.004123687744140625, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": 0.0002, |
|
"reward": 0.13488853815943003, |
|
"reward_std": 0.18247101455926895, |
|
"rewards/cosine_scaled_reward": -0.1040015157777816, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 2015.229232788086, |
|
"epoch": 0.19314285714285714, |
|
"grad_norm": 0.28857842087745667, |
|
"kl": 0.004528045654296875, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.0002, |
|
"reward": 0.26697871275246143, |
|
"reward_std": 0.20995871722698212, |
|
"rewards/cosine_scaled_reward": 0.1499726166948676, |
|
"rewards/format_reward": 0.729166679084301, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 2331.750030517578, |
|
"epoch": 0.19428571428571428, |
|
"grad_norm": 0.20006102323532104, |
|
"kl": 0.00605010986328125, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0002, |
|
"reward": 0.08345442125573754, |
|
"reward_std": 0.19285878352820873, |
|
"rewards/cosine_scaled_reward": -0.10043650306761265, |
|
"rewards/format_reward": 0.5208333432674408, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 1634.4791870117188, |
|
"epoch": 0.19542857142857142, |
|
"grad_norm": 0.26344433426856995, |
|
"kl": 0.00447845458984375, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": 0.0002, |
|
"reward": 0.10782040096819401, |
|
"reward_std": 0.13615941908210516, |
|
"rewards/cosine_scaled_reward": -0.16944693960249424, |
|
"rewards/format_reward": 0.7500000055879354, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 1950.7292022705078, |
|
"epoch": 0.19657142857142856, |
|
"grad_norm": 0.26507794857025146, |
|
"kl": 0.005542755126953125, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1530790887773037, |
|
"reward_std": 0.220405962318182, |
|
"rewards/cosine_scaled_reward": -0.01874719187617302, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 2552.2500915527344, |
|
"epoch": 0.1977142857142857, |
|
"grad_norm": 0.29694148898124695, |
|
"kl": 0.0063629150390625, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": 0.0003, |
|
"reward": 0.12018180638551712, |
|
"reward_std": 0.16073662787675858, |
|
"rewards/cosine_scaled_reward": -0.01620076596736908, |
|
"rewards/format_reward": 0.4791666679084301, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 2430.291717529297, |
|
"epoch": 0.19885714285714284, |
|
"grad_norm": 0.18767432868480682, |
|
"kl": 0.007476806640625, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.0003, |
|
"reward": 0.03346575051546097, |
|
"reward_std": 0.11665205657482147, |
|
"rewards/cosine_scaled_reward": -0.1950590880587697, |
|
"rewards/format_reward": 0.5208333432674408, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 2481.187545776367, |
|
"epoch": 0.2, |
|
"grad_norm": 0.26425570249557495, |
|
"kl": 0.00807952880859375, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0003, |
|
"reward": 0.14548239950090647, |
|
"reward_std": 0.17145178094506264, |
|
"rewards/cosine_scaled_reward": 0.005486873909831047, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 1953.3125305175781, |
|
"epoch": 0.20114285714285715, |
|
"grad_norm": 0.2169903814792633, |
|
"kl": 0.006252288818359375, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": 0.0003, |
|
"reward": 0.15049799345433712, |
|
"reward_std": 0.18127938732504845, |
|
"rewards/cosine_scaled_reward": -0.06541152065619826, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 1905.7083740234375, |
|
"epoch": 0.2022857142857143, |
|
"grad_norm": 0.2057493031024933, |
|
"kl": 0.0059356689453125, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.0002, |
|
"reward": 0.20941531658172607, |
|
"reward_std": 0.18847975879907608, |
|
"rewards/cosine_scaled_reward": 0.06081422168063, |
|
"rewards/format_reward": 0.6875000298023224, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 1686.2709045410156, |
|
"epoch": 0.20342857142857143, |
|
"grad_norm": 0.2742275595664978, |
|
"kl": 0.00555419921875, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.0002, |
|
"reward": 0.282832570374012, |
|
"reward_std": 0.15610528737306595, |
|
"rewards/cosine_scaled_reward": 0.13618910312652588, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 2144.1251220703125, |
|
"epoch": 0.20457142857142857, |
|
"grad_norm": 0.20610445737838745, |
|
"kl": 0.00775909423828125, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": 0.0003, |
|
"reward": 0.18123364634811878, |
|
"reward_std": 0.16907534934580326, |
|
"rewards/cosine_scaled_reward": 0.023578599095344543, |
|
"rewards/format_reward": 0.6458333432674408, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 2225.3959350585938, |
|
"epoch": 0.2057142857142857, |
|
"grad_norm": 0.2948279082775116, |
|
"kl": 0.008434295654296875, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0003, |
|
"reward": 0.10301533341407776, |
|
"reward_std": 0.23226897418498993, |
|
"rewards/cosine_scaled_reward": -0.09574815258383751, |
|
"rewards/format_reward": 0.5833333488553762, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 2334.104217529297, |
|
"epoch": 0.20685714285714285, |
|
"grad_norm": 0.21989773213863373, |
|
"kl": 0.004322052001953125, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1680104963015765, |
|
"reward_std": 0.2741067036986351, |
|
"rewards/cosine_scaled_reward": -0.010420721024274826, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 1716.2500305175781, |
|
"epoch": 0.208, |
|
"grad_norm": 0.2736404240131378, |
|
"kl": 0.006591796875, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0003, |
|
"reward": 0.19305634032934904, |
|
"reward_std": 0.17649215832352638, |
|
"rewards/cosine_scaled_reward": -0.020499907433986664, |
|
"rewards/format_reward": 0.7708333507180214, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 1984.6459045410156, |
|
"epoch": 0.20914285714285713, |
|
"grad_norm": 0.24810387194156647, |
|
"kl": 0.00687408447265625, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.0003, |
|
"reward": 0.11633813101798296, |
|
"reward_std": 0.19054009392857552, |
|
"rewards/cosine_scaled_reward": -0.16023868951015174, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 2012.0834045410156, |
|
"epoch": 0.2102857142857143, |
|
"grad_norm": 0.24479207396507263, |
|
"kl": 0.00539398193359375, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": 0.0002, |
|
"reward": 0.20850215945392847, |
|
"reward_std": 0.24574441090226173, |
|
"rewards/cosine_scaled_reward": 0.05496228300035, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 2561.9166870117188, |
|
"epoch": 0.21142857142857144, |
|
"grad_norm": 0.23638688027858734, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0003, |
|
"reward": 0.037306661397451535, |
|
"reward_std": 0.1875557340681553, |
|
"rewards/cosine_scaled_reward": -0.17976870480924845, |
|
"rewards/format_reward": 0.5000000260770321, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 2453.5625610351562, |
|
"epoch": 0.21257142857142858, |
|
"grad_norm": 0.30098989605903625, |
|
"kl": 0.0072784423828125, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": 0.0003, |
|
"reward": 0.15216808393597603, |
|
"reward_std": 0.17800051532685757, |
|
"rewards/cosine_scaled_reward": -0.01129375584423542, |
|
"rewards/format_reward": 0.6041666716337204, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 2095.1250610351562, |
|
"epoch": 0.21371428571428572, |
|
"grad_norm": 0.20456546545028687, |
|
"kl": 0.00882720947265625, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": 0.0004, |
|
"reward": 0.27147069573402405, |
|
"reward_std": 0.18336978182196617, |
|
"rewards/cosine_scaled_reward": 0.13264761865139008, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1552.937515258789, |
|
"epoch": 0.21485714285714286, |
|
"grad_norm": 0.25879716873168945, |
|
"kl": 0.00807952880859375, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": 0.0003, |
|
"reward": 0.29217225313186646, |
|
"reward_std": 0.23792832344770432, |
|
"rewards/cosine_scaled_reward": 0.1374435918405652, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 1397.5000305175781, |
|
"epoch": 0.216, |
|
"grad_norm": 0.2867569625377655, |
|
"kl": 0.00785064697265625, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.0003, |
|
"reward": 0.16837791539728642, |
|
"reward_std": 0.17263205349445343, |
|
"rewards/cosine_scaled_reward": -0.12355193216353655, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 2409.7501220703125, |
|
"epoch": 0.21714285714285714, |
|
"grad_norm": 0.23366515338420868, |
|
"kl": 0.00925445556640625, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.0004, |
|
"reward": 0.08992326661245897, |
|
"reward_std": 0.2077214140444994, |
|
"rewards/cosine_scaled_reward": -0.05378926917910576, |
|
"rewards/format_reward": 0.4583333469927311, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 1840.2708587646484, |
|
"epoch": 0.21828571428571428, |
|
"grad_norm": 0.24777474999427795, |
|
"kl": 0.006641387939453125, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": 0.0003, |
|
"reward": 0.1860494278371334, |
|
"reward_std": 0.18764295056462288, |
|
"rewards/cosine_scaled_reward": -0.014890416525304317, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 2362.5208740234375, |
|
"epoch": 0.21942857142857142, |
|
"grad_norm": 0.24280861020088196, |
|
"kl": 0.0124053955078125, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.0005, |
|
"reward": 0.0900897765532136, |
|
"reward_std": 0.21396854892373085, |
|
"rewards/cosine_scaled_reward": -0.11646672445931472, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 1967.1250915527344, |
|
"epoch": 0.22057142857142858, |
|
"grad_norm": 0.24913279712200165, |
|
"kl": 0.01198577880859375, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.0005, |
|
"reward": 0.12212350871413946, |
|
"reward_std": 0.1075000325217843, |
|
"rewards/cosine_scaled_reward": -0.1432434804737568, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 940.7083740234375, |
|
"epoch": 0.22171428571428572, |
|
"grad_norm": 0.30179327726364136, |
|
"kl": 0.0063323974609375, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": 0.0003, |
|
"reward": 0.4154938831925392, |
|
"reward_std": 0.16113552078604698, |
|
"rewards/cosine_scaled_reward": 0.308564942330122, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 1250.1042175292969, |
|
"epoch": 0.22285714285714286, |
|
"grad_norm": 0.27229517698287964, |
|
"kl": 0.0104827880859375, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.0004, |
|
"reward": 0.3013657033443451, |
|
"reward_std": 0.14612397830933332, |
|
"rewards/cosine_scaled_reward": 0.10115441353991628, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 2639.7709350585938, |
|
"epoch": 0.224, |
|
"grad_norm": 0.27179622650146484, |
|
"kl": 0.0116729736328125, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": 0.0005, |
|
"reward": 0.08495328156277537, |
|
"reward_std": 0.19798987358808517, |
|
"rewards/cosine_scaled_reward": -0.04823115328326821, |
|
"rewards/format_reward": 0.41666667722165585, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 2250.2708740234375, |
|
"epoch": 0.22514285714285714, |
|
"grad_norm": 0.2959842383861542, |
|
"kl": 0.00969696044921875, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": 0.0004, |
|
"reward": 0.05214718542993069, |
|
"reward_std": 0.17444902658462524, |
|
"rewards/cosine_scaled_reward": -0.19195930659770966, |
|
"rewards/format_reward": 0.5833333544433117, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 1825.916748046875, |
|
"epoch": 0.22628571428571428, |
|
"grad_norm": 0.21343165636062622, |
|
"kl": 0.00766754150390625, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": 0.0003, |
|
"reward": 0.20536806993186474, |
|
"reward_std": 0.19084695354104042, |
|
"rewards/cosine_scaled_reward": -0.04184096306562424, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 1967.9167175292969, |
|
"epoch": 0.22742857142857142, |
|
"grad_norm": 0.22258897125720978, |
|
"kl": 0.00786590576171875, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": 0.0003, |
|
"reward": 0.30358556658029556, |
|
"reward_std": 0.20891420915722847, |
|
"rewards/cosine_scaled_reward": 0.15997123159468174, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 1702.916748046875, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.3150264322757721, |
|
"kl": 0.00884246826171875, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0004, |
|
"reward": 0.23448583600111306, |
|
"reward_std": 0.21412995643913746, |
|
"rewards/cosine_scaled_reward": 0.06124690920114517, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1527.5625610351562, |
|
"epoch": 0.2297142857142857, |
|
"grad_norm": 0.23923619091510773, |
|
"kl": 0.0072479248046875, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": 0.0003, |
|
"reward": 0.198734937235713, |
|
"reward_std": 0.17404086515307426, |
|
"rewards/cosine_scaled_reward": -0.06422888732049614, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1109.9375457763672, |
|
"epoch": 0.23085714285714284, |
|
"grad_norm": 0.25801074504852295, |
|
"kl": 0.00966644287109375, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": 0.0004, |
|
"reward": 0.23950592055916786, |
|
"reward_std": 0.1612282581627369, |
|
"rewards/cosine_scaled_reward": -0.01809925213456154, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1692.8333740234375, |
|
"epoch": 0.232, |
|
"grad_norm": 0.31517601013183594, |
|
"kl": 0.0102996826171875, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": 0.0004, |
|
"reward": 0.24487797170877457, |
|
"reward_std": 0.21635670214891434, |
|
"rewards/cosine_scaled_reward": 0.08476148918271065, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 1215.8750305175781, |
|
"epoch": 0.23314285714285715, |
|
"grad_norm": 0.2657667398452759, |
|
"kl": 0.00862884521484375, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": 0.0003, |
|
"reward": 0.2890103794634342, |
|
"reward_std": 0.24866387993097305, |
|
"rewards/cosine_scaled_reward": 0.1112285777926445, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 1525.3333435058594, |
|
"epoch": 0.2342857142857143, |
|
"grad_norm": 0.21206864714622498, |
|
"kl": 0.00908660888671875, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.0004, |
|
"reward": 0.23077455908060074, |
|
"reward_std": 0.1799892894923687, |
|
"rewards/cosine_scaled_reward": -0.032498230517376214, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 2053.3125610351562, |
|
"epoch": 0.23542857142857143, |
|
"grad_norm": 0.28847047686576843, |
|
"kl": 0.0101165771484375, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": 0.0004, |
|
"reward": 0.11000457312911749, |
|
"reward_std": 0.18665008433163166, |
|
"rewards/cosine_scaled_reward": -0.14993075653910637, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1849.9167175292969, |
|
"epoch": 0.23657142857142857, |
|
"grad_norm": 0.19775180518627167, |
|
"kl": 0.006805419921875, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": 0.0003, |
|
"reward": 0.2954826056957245, |
|
"reward_std": 0.25379542633891106, |
|
"rewards/cosine_scaled_reward": 0.12750269658863544, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 1929.041748046875, |
|
"epoch": 0.2377142857142857, |
|
"grad_norm": 0.25739434361457825, |
|
"kl": 0.009552001953125, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.0004, |
|
"reward": 0.1019433755427599, |
|
"reward_std": 0.13344106450676918, |
|
"rewards/cosine_scaled_reward": -0.15905495546758175, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 1011.3333740234375, |
|
"epoch": 0.23885714285714285, |
|
"grad_norm": 0.3413412272930145, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": 0.0003, |
|
"reward": 0.17312408424913883, |
|
"reward_std": 0.20489512011408806, |
|
"rewards/cosine_scaled_reward": -0.11678014509379864, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 1356.7084045410156, |
|
"epoch": 0.24, |
|
"grad_norm": 0.23836910724639893, |
|
"kl": 0.00788116455078125, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": 0.0003, |
|
"reward": 0.31968575716018677, |
|
"reward_std": 0.156296506524086, |
|
"rewards/cosine_scaled_reward": 0.15581995248794556, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 2032.6458740234375, |
|
"epoch": 0.24114285714285713, |
|
"grad_norm": 0.23086276650428772, |
|
"kl": 0.0112152099609375, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": 0.0004, |
|
"reward": 0.13852777890861034, |
|
"reward_std": 0.1855682022869587, |
|
"rewards/cosine_scaled_reward": -0.12528848741203547, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 1474.5416717529297, |
|
"epoch": 0.2422857142857143, |
|
"grad_norm": 0.2737053632736206, |
|
"kl": 0.009735107421875, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": 0.0004, |
|
"reward": 0.2501720953732729, |
|
"reward_std": 0.19472362473607063, |
|
"rewards/cosine_scaled_reward": 0.0749430526047945, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1883.7500610351562, |
|
"epoch": 0.24342857142857144, |
|
"grad_norm": 0.2124582827091217, |
|
"kl": 0.01052093505859375, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": 0.0004, |
|
"reward": 0.0875893197953701, |
|
"reward_std": 0.12948580272495747, |
|
"rewards/cosine_scaled_reward": -0.19460760243237019, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 1905.2500610351562, |
|
"epoch": 0.24457142857142858, |
|
"grad_norm": 0.1766165941953659, |
|
"kl": 0.0074462890625, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.0003, |
|
"reward": 0.17481125239282846, |
|
"reward_std": 0.18312595039606094, |
|
"rewards/cosine_scaled_reward": -0.08106975071132183, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 1422.0209045410156, |
|
"epoch": 0.24571428571428572, |
|
"grad_norm": 0.20306651294231415, |
|
"kl": 0.00847625732421875, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.0003, |
|
"reward": 0.2214849442243576, |
|
"reward_std": 0.23549797013401985, |
|
"rewards/cosine_scaled_reward": -0.06182933505624533, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 1808.1458740234375, |
|
"epoch": 0.24685714285714286, |
|
"grad_norm": 0.18333780765533447, |
|
"kl": 0.00699615478515625, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.0003, |
|
"reward": 0.1494435027707368, |
|
"reward_std": 0.15389228984713554, |
|
"rewards/cosine_scaled_reward": -0.15077029541134834, |
|
"rewards/format_reward": 0.875, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 1527.5833740234375, |
|
"epoch": 0.248, |
|
"grad_norm": 0.23145990073680878, |
|
"kl": 0.0092010498046875, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": 0.0004, |
|
"reward": 0.2188200056552887, |
|
"reward_std": 0.15189463831484318, |
|
"rewards/cosine_scaled_reward": -0.015956051647663116, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 1596.1250305175781, |
|
"epoch": 0.24914285714285714, |
|
"grad_norm": 0.22957631945610046, |
|
"kl": 0.00939178466796875, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": 0.0004, |
|
"reward": 0.2616739912191406, |
|
"reward_std": 0.1865678783506155, |
|
"rewards/cosine_scaled_reward": 0.09852963499724865, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 1126.3541870117188, |
|
"epoch": 0.2502857142857143, |
|
"grad_norm": 0.24680417776107788, |
|
"kl": 0.0069427490234375, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": 0.0003, |
|
"reward": 0.21046569012105465, |
|
"reward_std": 0.19767357036471367, |
|
"rewards/cosine_scaled_reward": -0.08539294765796512, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 1128.4583740234375, |
|
"epoch": 0.25142857142857145, |
|
"grad_norm": 0.2578405439853668, |
|
"kl": 0.009613037109375, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": 0.0004, |
|
"reward": 0.23143617436289787, |
|
"reward_std": 0.1721612773835659, |
|
"rewards/cosine_scaled_reward": -0.04716856777667999, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 1951.7709045410156, |
|
"epoch": 0.25257142857142856, |
|
"grad_norm": 0.33379021286964417, |
|
"kl": 0.0142974853515625, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": 0.0006, |
|
"reward": 0.13845073012635112, |
|
"reward_std": 0.14940405637025833, |
|
"rewards/cosine_scaled_reward": -0.045577242970466614, |
|
"rewards/format_reward": 0.6250000223517418, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 2032.8125305175781, |
|
"epoch": 0.2537142857142857, |
|
"grad_norm": 0.20354242622852325, |
|
"kl": 0.009674072265625, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": 0.0004, |
|
"reward": 0.041867110412567854, |
|
"reward_std": 0.11502710357308388, |
|
"rewards/cosine_scaled_reward": -0.28463663905858994, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 1453.1250305175781, |
|
"epoch": 0.25485714285714284, |
|
"grad_norm": 0.22267840802669525, |
|
"kl": 0.0079193115234375, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0003, |
|
"reward": 0.1836735513061285, |
|
"reward_std": 0.16883151605725288, |
|
"rewards/cosine_scaled_reward": -0.10032966919243336, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 1780.9583740234375, |
|
"epoch": 0.256, |
|
"grad_norm": 0.2752217948436737, |
|
"kl": 0.00952911376953125, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": 0.0004, |
|
"reward": 0.21352362632751465, |
|
"reward_std": 0.1610415056347847, |
|
"rewards/cosine_scaled_reward": -0.00915946438908577, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1222.2708435058594, |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.21294601261615753, |
|
"kl": 0.007015228271484375, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0003, |
|
"reward": 0.3245595395565033, |
|
"reward_std": 0.10259661450982094, |
|
"rewards/cosine_scaled_reward": 0.1599423922598362, |
|
"rewards/format_reward": 0.9375, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 1372.7708587646484, |
|
"epoch": 0.2582857142857143, |
|
"grad_norm": 0.3051982522010803, |
|
"kl": 0.012664794921875, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.0005, |
|
"reward": 0.09605667972937226, |
|
"reward_std": 0.11557916086167097, |
|
"rewards/cosine_scaled_reward": -0.24685227498412132, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 1197.4375305175781, |
|
"epoch": 0.25942857142857145, |
|
"grad_norm": 0.27726492285728455, |
|
"kl": 0.0123291015625, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.0005, |
|
"reward": 0.21336308866739273, |
|
"reward_std": 0.1688038632273674, |
|
"rewards/cosine_scaled_reward": -0.05070815235376358, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1539.5208740234375, |
|
"epoch": 0.26057142857142856, |
|
"grad_norm": 0.28033754229545593, |
|
"kl": 0.01001739501953125, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": 0.0004, |
|
"reward": 0.1344006136059761, |
|
"reward_std": 0.18115575425326824, |
|
"rewards/cosine_scaled_reward": -0.14776835404336452, |
|
"rewards/format_reward": 0.8125, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 1288.6250457763672, |
|
"epoch": 0.26171428571428573, |
|
"grad_norm": 0.3743990361690521, |
|
"kl": 0.0155181884765625, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": 0.0006, |
|
"reward": 0.15809894306585193, |
|
"reward_std": 0.15935586765408516, |
|
"rewards/cosine_scaled_reward": -0.12152018398046494, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 1595.8750305175781, |
|
"epoch": 0.26285714285714284, |
|
"grad_norm": 0.32262828946113586, |
|
"kl": 0.01462554931640625, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.0006, |
|
"reward": 0.22118227370083332, |
|
"reward_std": 0.22235484700649977, |
|
"rewards/cosine_scaled_reward": 0.02892589569091797, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1655.1250610351562, |
|
"epoch": 0.264, |
|
"grad_norm": 0.2636210322380066, |
|
"kl": 0.01177978515625, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0005, |
|
"reward": 0.2390340268611908, |
|
"reward_std": 0.22538743168115616, |
|
"rewards/cosine_scaled_reward": 0.025473197922110558, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1273.1250305175781, |
|
"epoch": 0.2651428571428571, |
|
"grad_norm": 0.20838648080825806, |
|
"kl": 0.01004791259765625, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": 0.0004, |
|
"reward": 0.2724638655781746, |
|
"reward_std": 0.24231833592057228, |
|
"rewards/cosine_scaled_reward": 0.0865764303598553, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 1766.0416870117188, |
|
"epoch": 0.2662857142857143, |
|
"grad_norm": 0.22196227312088013, |
|
"kl": 0.0135498046875, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": 0.0005, |
|
"reward": 0.2252907119691372, |
|
"reward_std": 0.18449927121400833, |
|
"rewards/cosine_scaled_reward": 0.013371981680393219, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 1795.4375610351562, |
|
"epoch": 0.2674285714285714, |
|
"grad_norm": 0.22926455736160278, |
|
"kl": 0.013641357421875, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": 0.0005, |
|
"reward": 0.18019726127386093, |
|
"reward_std": 0.1873026303946972, |
|
"rewards/cosine_scaled_reward": -0.057943904772400856, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 1416.2500610351562, |
|
"epoch": 0.26857142857142857, |
|
"grad_norm": 0.4748923182487488, |
|
"kl": 0.01238250732421875, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0005, |
|
"reward": 0.2144976705312729, |
|
"reward_std": 0.1543455570936203, |
|
"rewards/cosine_scaled_reward": -0.017965801060199738, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 1684.5208740234375, |
|
"epoch": 0.26971428571428574, |
|
"grad_norm": 0.27250581979751587, |
|
"kl": 0.0109100341796875, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": 0.0004, |
|
"reward": 0.15142692811787128, |
|
"reward_std": 0.123539624735713, |
|
"rewards/cosine_scaled_reward": -0.15956238843500614, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1095.9583435058594, |
|
"epoch": 0.27085714285714285, |
|
"grad_norm": 0.23535336554050446, |
|
"kl": 0.0092010498046875, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": 0.0004, |
|
"reward": 0.2338778730481863, |
|
"reward_std": 0.13493703678250313, |
|
"rewards/cosine_scaled_reward": -0.05256740562617779, |
|
"rewards/format_reward": 1.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 1308.5000305175781, |
|
"epoch": 0.272, |
|
"grad_norm": 0.4510970115661621, |
|
"kl": 0.013580322265625, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.0005, |
|
"reward": 0.25004918687045574, |
|
"reward_std": 0.20090295001864433, |
|
"rewards/cosine_scaled_reward": 0.025465428829193115, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 2084.7500610351562, |
|
"epoch": 0.27314285714285713, |
|
"grad_norm": 0.41042184829711914, |
|
"kl": 0.01446533203125, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.0006, |
|
"reward": 0.06340811308473349, |
|
"reward_std": 0.18164737150073051, |
|
"rewards/cosine_scaled_reward": -0.22181823663413525, |
|
"rewards/format_reward": 0.6875000223517418, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 1270.0417175292969, |
|
"epoch": 0.2742857142857143, |
|
"grad_norm": 0.2745667099952698, |
|
"kl": 0.01023101806640625, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0004, |
|
"reward": 0.3920341283082962, |
|
"reward_std": 0.14432355761528015, |
|
"rewards/cosine_scaled_reward": 0.28012172505259514, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1518.3958740234375, |
|
"epoch": 0.2754285714285714, |
|
"grad_norm": 0.4780232310295105, |
|
"kl": 0.018524169921875, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": 0.0007, |
|
"reward": 0.21057775150984526, |
|
"reward_std": 0.19549552723765373, |
|
"rewards/cosine_scaled_reward": -0.04673420591279864, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1732.7500610351562, |
|
"epoch": 0.2765714285714286, |
|
"grad_norm": 0.47158530354499817, |
|
"kl": 0.0208740234375, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": 0.0008, |
|
"reward": 0.1667325645685196, |
|
"reward_std": 0.24624676629900932, |
|
"rewards/cosine_scaled_reward": -0.09344856068491936, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 1035.7083587646484, |
|
"epoch": 0.2777142857142857, |
|
"grad_norm": 0.22557544708251953, |
|
"kl": 0.00783538818359375, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.0003, |
|
"reward": 0.22993716970086098, |
|
"reward_std": 0.08992603048682213, |
|
"rewards/cosine_scaled_reward": -0.05973348394036293, |
|
"rewards/format_reward": 1.0, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 1274.2708740234375, |
|
"epoch": 0.27885714285714286, |
|
"grad_norm": 0.24874719977378845, |
|
"kl": 0.00920867919921875, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": 0.0004, |
|
"reward": 0.2895762138068676, |
|
"reward_std": 0.237772386521101, |
|
"rewards/cosine_scaled_reward": 0.10733084753155708, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1134.4167175292969, |
|
"epoch": 0.28, |
|
"grad_norm": 0.29308614134788513, |
|
"kl": 0.00977325439453125, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0004, |
|
"reward": 0.2019298244267702, |
|
"reward_std": 0.14933411590754986, |
|
"rewards/cosine_scaled_reward": -0.09132302179932594, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 1158.8542175292969, |
|
"epoch": 0.28114285714285714, |
|
"grad_norm": 0.3501374423503876, |
|
"kl": 0.012237548828125, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0005, |
|
"reward": 0.25629024021327496, |
|
"reward_std": 0.132151298224926, |
|
"rewards/cosine_scaled_reward": 0.014732152223587036, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 1158.7292022705078, |
|
"epoch": 0.2822857142857143, |
|
"grad_norm": 0.3884400427341461, |
|
"kl": 0.0135498046875, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": 0.0005, |
|
"reward": 0.3844507783651352, |
|
"reward_std": 0.1857384592294693, |
|
"rewards/cosine_scaled_reward": 0.250643078237772, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 1636.3750610351562, |
|
"epoch": 0.2834285714285714, |
|
"grad_norm": 0.4415358304977417, |
|
"kl": 0.01995849609375, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": 0.0008, |
|
"reward": 0.15714343590661883, |
|
"reward_std": 0.15885592438280582, |
|
"rewards/cosine_scaled_reward": -0.10814489889889956, |
|
"rewards/format_reward": 0.8125, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1972.7084045410156, |
|
"epoch": 0.2845714285714286, |
|
"grad_norm": 0.35937413573265076, |
|
"kl": 0.0176849365234375, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": 0.0007, |
|
"reward": 0.16410245560109615, |
|
"reward_std": 0.23904583044350147, |
|
"rewards/cosine_scaled_reward": -0.05793091654777527, |
|
"rewards/format_reward": 0.7500000111758709, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1539.4375457763672, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.26806238293647766, |
|
"kl": 0.02465057373046875, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.001, |
|
"reward": 0.25902947783470154, |
|
"reward_std": 0.24061259999871254, |
|
"rewards/cosine_scaled_reward": 0.08197902701795101, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 2087.291717529297, |
|
"epoch": 0.28685714285714287, |
|
"grad_norm": 0.42723548412323, |
|
"kl": 0.031829833984375, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": 0.0013, |
|
"reward": 0.0892067551612854, |
|
"reward_std": 0.2596677578985691, |
|
"rewards/cosine_scaled_reward": -0.06505941599607468, |
|
"rewards/format_reward": 0.479166679084301, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1393.5000305175781, |
|
"epoch": 0.288, |
|
"grad_norm": 0.4383827745914459, |
|
"kl": 0.01575469970703125, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": 0.0006, |
|
"reward": 0.27515115961432457, |
|
"reward_std": 0.14459671452641487, |
|
"rewards/cosine_scaled_reward": 0.06821848452091217, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 1605.9792175292969, |
|
"epoch": 0.28914285714285715, |
|
"grad_norm": 0.8857656121253967, |
|
"kl": 0.0370941162109375, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": 0.0015, |
|
"reward": 0.17214620485901833, |
|
"reward_std": 0.18065106682479382, |
|
"rewards/cosine_scaled_reward": -0.007656781002879143, |
|
"rewards/format_reward": 0.666666679084301, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 1118.2708740234375, |
|
"epoch": 0.29028571428571426, |
|
"grad_norm": 0.39530983567237854, |
|
"kl": 0.018463134765625, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.0007, |
|
"reward": 0.30500828090589494, |
|
"reward_std": 0.14383957721292973, |
|
"rewards/cosine_scaled_reward": 0.20514516159892082, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 1547.8958587646484, |
|
"epoch": 0.2914285714285714, |
|
"grad_norm": 0.40311965346336365, |
|
"kl": 0.0308685302734375, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0012, |
|
"reward": 0.232159405015409, |
|
"reward_std": 0.16861450299620628, |
|
"rewards/cosine_scaled_reward": 0.06933547928929329, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 1279.7083435058594, |
|
"epoch": 0.2925714285714286, |
|
"grad_norm": 1.5129071474075317, |
|
"kl": 0.02239990234375, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.0009, |
|
"reward": 0.226328669115901, |
|
"reward_std": 0.173635708168149, |
|
"rewards/cosine_scaled_reward": -0.02187468856573105, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 1523.9375305175781, |
|
"epoch": 0.2937142857142857, |
|
"grad_norm": 0.38841739296913147, |
|
"kl": 0.0191497802734375, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": 0.0008, |
|
"reward": 0.14000426977872849, |
|
"reward_std": 0.14408636838197708, |
|
"rewards/cosine_scaled_reward": -0.1849204022437334, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 1983.7084045410156, |
|
"epoch": 0.2948571428571429, |
|
"grad_norm": 0.3051685690879822, |
|
"kl": 0.029571533203125, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.0012, |
|
"reward": 0.22874920442700386, |
|
"reward_std": 0.22693637385964394, |
|
"rewards/cosine_scaled_reward": 0.04347135126590729, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 1227.4791870117188, |
|
"epoch": 0.296, |
|
"grad_norm": 0.3898649215698242, |
|
"kl": 0.02254486083984375, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": 0.0009, |
|
"reward": 0.19578362628817558, |
|
"reward_std": 0.17094986885786057, |
|
"rewards/cosine_scaled_reward": -0.07543798349797726, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 2015.0208740234375, |
|
"epoch": 0.29714285714285715, |
|
"grad_norm": 0.3966330885887146, |
|
"kl": 0.04931640625, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.002, |
|
"reward": 0.059827481396496296, |
|
"reward_std": 0.1531398855149746, |
|
"rewards/cosine_scaled_reward": -0.17863771319389343, |
|
"rewards/format_reward": 0.583333358168602, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1767.354248046875, |
|
"epoch": 0.29828571428571427, |
|
"grad_norm": 0.4153224527835846, |
|
"kl": 0.0479583740234375, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.0019, |
|
"reward": 0.11496858485043049, |
|
"reward_std": 0.1553056426346302, |
|
"rewards/cosine_scaled_reward": -0.18099842593073845, |
|
"rewards/format_reward": 0.8125, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 1918.3958740234375, |
|
"epoch": 0.29942857142857143, |
|
"grad_norm": 0.7899470925331116, |
|
"kl": 0.0538177490234375, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.0022, |
|
"reward": 0.0993692995980382, |
|
"reward_std": 0.13485800474882126, |
|
"rewards/cosine_scaled_reward": -0.17313212295994163, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 1160.1250457763672, |
|
"epoch": 0.30057142857142854, |
|
"grad_norm": 0.463293194770813, |
|
"kl": 0.032501220703125, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": 0.0013, |
|
"reward": 0.1619243435561657, |
|
"reward_std": 0.18417713977396488, |
|
"rewards/cosine_scaled_reward": -0.16348575986921787, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 1765.7083740234375, |
|
"epoch": 0.3017142857142857, |
|
"grad_norm": 0.4353474974632263, |
|
"kl": 0.05523681640625, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": 0.0022, |
|
"reward": 0.22621536999940872, |
|
"reward_std": 0.20676996186375618, |
|
"rewards/cosine_scaled_reward": -0.006626792252063751, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 2156.8750915527344, |
|
"epoch": 0.3028571428571429, |
|
"grad_norm": 0.4260510206222534, |
|
"kl": 0.06158447265625, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.0025, |
|
"reward": 0.1871240846812725, |
|
"reward_std": 0.14801884070038795, |
|
"rewards/cosine_scaled_reward": 0.006121315062046051, |
|
"rewards/format_reward": 0.7083333395421505, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 2359.229248046875, |
|
"epoch": 0.304, |
|
"grad_norm": 0.6653827428817749, |
|
"kl": 0.1218414306640625, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": 0.0049, |
|
"reward": 0.05368301854468882, |
|
"reward_std": 0.1643918640911579, |
|
"rewards/cosine_scaled_reward": -0.15773088112473488, |
|
"rewards/format_reward": 0.5208333544433117, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 2027.0833740234375, |
|
"epoch": 0.30514285714285716, |
|
"grad_norm": 0.3705967962741852, |
|
"kl": 0.06707763671875, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.0027, |
|
"reward": 0.18167724087834358, |
|
"reward_std": 0.1626145876944065, |
|
"rewards/cosine_scaled_reward": -0.0044740717858076096, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 1997.5209045410156, |
|
"epoch": 0.3062857142857143, |
|
"grad_norm": 0.6208778619766235, |
|
"kl": 0.051788330078125, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": 0.0021, |
|
"reward": 0.13248581159859896, |
|
"reward_std": 0.18944942951202393, |
|
"rewards/cosine_scaled_reward": -0.1127938311547041, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 1530.0208740234375, |
|
"epoch": 0.30742857142857144, |
|
"grad_norm": 0.7959035038948059, |
|
"kl": 0.055450439453125, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": 0.0022, |
|
"reward": 0.16591855697333813, |
|
"reward_std": 0.16789162531495094, |
|
"rewards/cosine_scaled_reward": -0.0885553527623415, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 985.6042022705078, |
|
"epoch": 0.30857142857142855, |
|
"grad_norm": 0.32174134254455566, |
|
"kl": 0.01244354248046875, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0005, |
|
"reward": 0.22263910062611103, |
|
"reward_std": 0.1655977163463831, |
|
"rewards/cosine_scaled_reward": -0.062186723574995995, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1733.0834045410156, |
|
"epoch": 0.3097142857142857, |
|
"grad_norm": 0.4459218680858612, |
|
"kl": 0.060455322265625, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": 0.0024, |
|
"reward": 0.204132997430861, |
|
"reward_std": 0.1648325566202402, |
|
"rewards/cosine_scaled_reward": -0.0029181139543652534, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 2305.7709045410156, |
|
"epoch": 0.31085714285714283, |
|
"grad_norm": 1.647191047668457, |
|
"kl": 0.1053466796875, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": 0.0042, |
|
"reward": 0.09154188225511461, |
|
"reward_std": 0.14747749641537666, |
|
"rewards/cosine_scaled_reward": -0.12416816502809525, |
|
"rewards/format_reward": 0.604166679084301, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 2009.3333740234375, |
|
"epoch": 0.312, |
|
"grad_norm": 0.4497400224208832, |
|
"kl": 0.12188720703125, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": 0.0049, |
|
"reward": 0.09924113377928734, |
|
"reward_std": 0.1725939903408289, |
|
"rewards/cosine_scaled_reward": -0.1677750125527382, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 1482.4167175292969, |
|
"epoch": 0.31314285714285717, |
|
"grad_norm": 0.45461875200271606, |
|
"kl": 0.025146484375, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": 0.001, |
|
"reward": 0.1875467412173748, |
|
"reward_std": 0.15722386725246906, |
|
"rewards/cosine_scaled_reward": -0.08861712459474802, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 2088.229248046875, |
|
"epoch": 0.3142857142857143, |
|
"grad_norm": 0.8553512692451477, |
|
"kl": 0.09869384765625, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0039, |
|
"reward": 0.08563580922782421, |
|
"reward_std": 0.21715955808758736, |
|
"rewards/cosine_scaled_reward": -0.15975168626755476, |
|
"rewards/format_reward": 0.6458333507180214, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 1473.0417022705078, |
|
"epoch": 0.31542857142857145, |
|
"grad_norm": 0.6246925592422485, |
|
"kl": 0.06494140625, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": 0.0026, |
|
"reward": 0.2704322747886181, |
|
"reward_std": 0.17775586992502213, |
|
"rewards/cosine_scaled_reward": 0.07806644402444363, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1408.9583740234375, |
|
"epoch": 0.31657142857142856, |
|
"grad_norm": 0.7783192992210388, |
|
"kl": 0.05496978759765625, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.0022, |
|
"reward": 0.20983506552875042, |
|
"reward_std": 0.19946245104074478, |
|
"rewards/cosine_scaled_reward": -0.04521218314766884, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 1812.8959045410156, |
|
"epoch": 0.3177142857142857, |
|
"grad_norm": 0.9162847995758057, |
|
"kl": 0.1117401123046875, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": 0.0045, |
|
"reward": 0.1658716592937708, |
|
"reward_std": 0.20760459825396538, |
|
"rewards/cosine_scaled_reward": -0.04410050390288234, |
|
"rewards/format_reward": 0.729166679084301, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1827.1666870117188, |
|
"epoch": 0.31885714285714284, |
|
"grad_norm": 1.2598878145217896, |
|
"kl": 0.12068939208984375, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": 0.0048, |
|
"reward": 0.1136073712259531, |
|
"reward_std": 0.2031346820294857, |
|
"rewards/cosine_scaled_reward": -0.1427288819104433, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 1207.6041717529297, |
|
"epoch": 0.32, |
|
"grad_norm": 0.3006845712661743, |
|
"kl": 0.0462493896484375, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.0018, |
|
"reward": 0.25188567116856575, |
|
"reward_std": 0.16300074756145477, |
|
"rewards/cosine_scaled_reward": -0.016072510741651058, |
|
"rewards/format_reward": 1.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 1508.6667175292969, |
|
"epoch": 0.3211428571428571, |
|
"grad_norm": 1.2887221574783325, |
|
"kl": 0.072967529296875, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": 0.0029, |
|
"reward": 0.19021394243463874, |
|
"reward_std": 0.19989290460944176, |
|
"rewards/cosine_scaled_reward": -0.0689934715628624, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 1973.854248046875, |
|
"epoch": 0.3222857142857143, |
|
"grad_norm": 0.9689493775367737, |
|
"kl": 0.158477783203125, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": 0.0063, |
|
"reward": 0.13670184463262558, |
|
"reward_std": 0.21750157698988914, |
|
"rewards/cosine_scaled_reward": -0.11265072226524353, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 1349.2083740234375, |
|
"epoch": 0.32342857142857145, |
|
"grad_norm": 1.58186936378479, |
|
"kl": 0.15234375, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.0061, |
|
"reward": 0.15773484483361244, |
|
"reward_std": 0.15496913716197014, |
|
"rewards/cosine_scaled_reward": -0.15540640894323587, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 1698.4791870117188, |
|
"epoch": 0.32457142857142857, |
|
"grad_norm": 1.2330901622772217, |
|
"kl": 0.1558837890625, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": 0.0062, |
|
"reward": 0.14318925887346268, |
|
"reward_std": 0.2059284672141075, |
|
"rewards/cosine_scaled_reward": -0.12442192807793617, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 1879.6459045410156, |
|
"epoch": 0.32571428571428573, |
|
"grad_norm": 1.837733507156372, |
|
"kl": 0.2252197265625, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": 0.009, |
|
"reward": 0.09095461945980787, |
|
"reward_std": 0.16584222950041294, |
|
"rewards/cosine_scaled_reward": -0.18948345258831978, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 1449.5000305175781, |
|
"epoch": 0.32685714285714285, |
|
"grad_norm": 0.5687366724014282, |
|
"kl": 0.106109619140625, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": 0.0042, |
|
"reward": 0.24451042897999287, |
|
"reward_std": 0.19233474135398865, |
|
"rewards/cosine_scaled_reward": -0.009479179978370667, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 1656.9583740234375, |
|
"epoch": 0.328, |
|
"grad_norm": 1.2167413234710693, |
|
"kl": 0.271484375, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.0109, |
|
"reward": 0.12593204155564308, |
|
"reward_std": 0.16647349670529366, |
|
"rewards/cosine_scaled_reward": -0.14858145266771317, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 1323.0000305175781, |
|
"epoch": 0.3291428571428571, |
|
"grad_norm": 0.9303487539291382, |
|
"kl": 0.2335205078125, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": 0.0093, |
|
"reward": 0.23953218385577202, |
|
"reward_std": 0.1995762512087822, |
|
"rewards/cosine_scaled_reward": 0.029527440055971965, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 1701.0000610351562, |
|
"epoch": 0.3302857142857143, |
|
"grad_norm": 2.4383161067962646, |
|
"kl": 0.27117919921875, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": 0.0109, |
|
"reward": 0.15982208959758282, |
|
"reward_std": 0.15560205932706594, |
|
"rewards/cosine_scaled_reward": -0.07527756690979004, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 1244.2292022705078, |
|
"epoch": 0.3314285714285714, |
|
"grad_norm": 1.5626213550567627, |
|
"kl": 0.178863525390625, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0072, |
|
"reward": 0.3573876768350601, |
|
"reward_std": 0.1983262486755848, |
|
"rewards/cosine_scaled_reward": 0.20004013180732727, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 1349.1250610351562, |
|
"epoch": 0.3325714285714286, |
|
"grad_norm": 1.2207589149475098, |
|
"kl": 0.31201171875, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.0125, |
|
"reward": 0.16633182391524315, |
|
"reward_std": 0.15154998749494553, |
|
"rewards/cosine_scaled_reward": -0.1112820515409112, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 1625.3542022705078, |
|
"epoch": 0.33371428571428574, |
|
"grad_norm": 1.6357200145721436, |
|
"kl": 0.2861785888671875, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": 0.0114, |
|
"reward": 0.12437815871089697, |
|
"reward_std": 0.20795376785099506, |
|
"rewards/cosine_scaled_reward": -0.14810878783464432, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 1631.4167175292969, |
|
"epoch": 0.33485714285714285, |
|
"grad_norm": 1.7008156776428223, |
|
"kl": 0.48681640625, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": 0.0195, |
|
"reward": 0.12261722923722118, |
|
"reward_std": 0.17399809882044792, |
|
"rewards/cosine_scaled_reward": -0.1478295437991619, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 1580.5625305175781, |
|
"epoch": 0.336, |
|
"grad_norm": 1.610378623008728, |
|
"kl": 0.1676025390625, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": 0.0067, |
|
"reward": 0.28274150006473064, |
|
"reward_std": 0.1986326277256012, |
|
"rewards/cosine_scaled_reward": 0.10465374775230885, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 2265.6458740234375, |
|
"epoch": 0.33714285714285713, |
|
"grad_norm": 2.0899946689605713, |
|
"kl": 0.60400390625, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0241, |
|
"reward": 0.12508661299943924, |
|
"reward_std": 0.15057932399213314, |
|
"rewards/cosine_scaled_reward": -0.04951752349734306, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 1289.0416870117188, |
|
"epoch": 0.3382857142857143, |
|
"grad_norm": 1.59931218624115, |
|
"kl": 0.325592041015625, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.013, |
|
"reward": 0.3012130483984947, |
|
"reward_std": 0.27314068377017975, |
|
"rewards/cosine_scaled_reward": 0.14410861767828465, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 1420.2708892822266, |
|
"epoch": 0.3394285714285714, |
|
"grad_norm": 2.525320053100586, |
|
"kl": 0.368133544921875, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.0147, |
|
"reward": 0.12896058335900307, |
|
"reward_std": 0.15636470913887024, |
|
"rewards/cosine_scaled_reward": -0.15718013513833284, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 1947.3750305175781, |
|
"epoch": 0.3405714285714286, |
|
"grad_norm": 2.7730467319488525, |
|
"kl": 0.7021484375, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": 0.0281, |
|
"reward": 0.20079701766371727, |
|
"reward_std": 0.2073996216058731, |
|
"rewards/cosine_scaled_reward": -0.03276558732613921, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 1455.3958740234375, |
|
"epoch": 0.3417142857142857, |
|
"grad_norm": 2.607783317565918, |
|
"kl": 0.4853515625, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": 0.0194, |
|
"reward": 0.12638170272111893, |
|
"reward_std": 0.17803113162517548, |
|
"rewards/cosine_scaled_reward": -0.17176830675452948, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 1603.4166870117188, |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 3.5268094539642334, |
|
"kl": 0.693359375, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0277, |
|
"reward": 0.2142921146005392, |
|
"reward_std": 0.14737887866795063, |
|
"rewards/cosine_scaled_reward": 0.022697463631629944, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 1849.9791870117188, |
|
"epoch": 0.344, |
|
"grad_norm": 195.25047302246094, |
|
"kl": 6.3701171875, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.2556, |
|
"reward": 0.22336112707853317, |
|
"reward_std": 0.2203882373869419, |
|
"rewards/cosine_scaled_reward": 0.050891561433672905, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 1800.541748046875, |
|
"epoch": 0.34514285714285714, |
|
"grad_norm": 2.098578453063965, |
|
"kl": 0.9892578125, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": 0.0396, |
|
"reward": 0.14433493767865002, |
|
"reward_std": 0.15274815633893013, |
|
"rewards/cosine_scaled_reward": -0.090041883289814, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 1295.1042175292969, |
|
"epoch": 0.3462857142857143, |
|
"grad_norm": 1.0480232238769531, |
|
"kl": 0.491546630859375, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": 0.0197, |
|
"reward": 0.20194483920931816, |
|
"reward_std": 0.13899757340550423, |
|
"rewards/cosine_scaled_reward": -0.060742251574993134, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 1140.8958587646484, |
|
"epoch": 0.3474285714285714, |
|
"grad_norm": 2.1697607040405273, |
|
"kl": 0.461669921875, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.0184, |
|
"reward": 0.3258020356297493, |
|
"reward_std": 0.24504756554961205, |
|
"rewards/cosine_scaled_reward": 0.16624495573341846, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 1822.1458435058594, |
|
"epoch": 0.3485714285714286, |
|
"grad_norm": 1.8082146644592285, |
|
"kl": 0.8291015625, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.0332, |
|
"reward": 0.13778295274823904, |
|
"reward_std": 0.19784759543836117, |
|
"rewards/cosine_scaled_reward": -0.10065719857811928, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 1604.3542175292969, |
|
"epoch": 0.3497142857142857, |
|
"grad_norm": 4.8597798347473145, |
|
"kl": 0.73583984375, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.0294, |
|
"reward": 0.194507101085037, |
|
"reward_std": 0.16609442234039307, |
|
"rewards/cosine_scaled_reward": -0.0308070071041584, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 1429.8750305175781, |
|
"epoch": 0.35085714285714287, |
|
"grad_norm": 1.5004690885543823, |
|
"kl": 0.541259765625, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": 0.0216, |
|
"reward": 0.34246205165982246, |
|
"reward_std": 0.23809099197387695, |
|
"rewards/cosine_scaled_reward": 0.23215805366635323, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 1361.8333587646484, |
|
"epoch": 0.352, |
|
"grad_norm": 2.2864489555358887, |
|
"kl": 0.517822265625, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": 0.0208, |
|
"reward": 0.16233128495514393, |
|
"reward_std": 0.1740443892776966, |
|
"rewards/cosine_scaled_reward": -0.05880259908735752, |
|
"rewards/format_reward": 0.7291666716337204, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 1388.812515258789, |
|
"epoch": 0.35314285714285715, |
|
"grad_norm": 1.1416103839874268, |
|
"kl": 0.419677734375, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.0168, |
|
"reward": 0.1640561018139124, |
|
"reward_std": 0.19008341804146767, |
|
"rewards/cosine_scaled_reward": -0.10478888358920813, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 1815.3333892822266, |
|
"epoch": 0.35428571428571426, |
|
"grad_norm": 1.7600479125976562, |
|
"kl": 0.5670166015625, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": 0.0227, |
|
"reward": 0.18639850337058306, |
|
"reward_std": 0.14701339416205883, |
|
"rewards/cosine_scaled_reward": -0.038788361474871635, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 1752.2500305175781, |
|
"epoch": 0.3554285714285714, |
|
"grad_norm": 1.5405431985855103, |
|
"kl": 0.7080078125, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": 0.0283, |
|
"reward": 0.20283151790499687, |
|
"reward_std": 0.1671408899128437, |
|
"rewards/cosine_scaled_reward": -0.00958926323801279, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 1452.8333740234375, |
|
"epoch": 0.3565714285714286, |
|
"grad_norm": 1.9203969240188599, |
|
"kl": 0.5329742431640625, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": 0.0214, |
|
"reward": 0.1789357993984595, |
|
"reward_std": 0.1928165927529335, |
|
"rewards/cosine_scaled_reward": -0.06470100209116936, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 1656.5625610351562, |
|
"epoch": 0.3577142857142857, |
|
"grad_norm": 3.925797462463379, |
|
"kl": 0.5869140625, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": 0.0234, |
|
"reward": 0.15515877585858107, |
|
"reward_std": 0.16902573220431805, |
|
"rewards/cosine_scaled_reward": -0.15649997163563967, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 1387.2291870117188, |
|
"epoch": 0.3588571428571429, |
|
"grad_norm": 2.6238675117492676, |
|
"kl": 0.428863525390625, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": 0.0172, |
|
"reward": 0.22301865927875042, |
|
"reward_std": 0.14535253681242466, |
|
"rewards/cosine_scaled_reward": -0.030876588076353073, |
|
"rewards/format_reward": 0.9166666716337204, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 1324.2083587646484, |
|
"epoch": 0.36, |
|
"grad_norm": 1.4441876411437988, |
|
"kl": 0.6809234619140625, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0273, |
|
"reward": 0.16800063382834196, |
|
"reward_std": 0.16469407826662064, |
|
"rewards/cosine_scaled_reward": -0.09765112772583961, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 1827.1042175292969, |
|
"epoch": 0.36114285714285715, |
|
"grad_norm": 1.9154859781265259, |
|
"kl": 0.8681640625, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": 0.0347, |
|
"reward": 0.0611695135012269, |
|
"reward_std": 0.20097459852695465, |
|
"rewards/cosine_scaled_reward": -0.22885679081082344, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 1761.1042175292969, |
|
"epoch": 0.36228571428571427, |
|
"grad_norm": 1.2220473289489746, |
|
"kl": 0.858551025390625, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": 0.0343, |
|
"reward": 0.2909087585285306, |
|
"reward_std": 0.20921817421913147, |
|
"rewards/cosine_scaled_reward": 0.12270434573292732, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 1412.3333740234375, |
|
"epoch": 0.36342857142857143, |
|
"grad_norm": 1.2659395933151245, |
|
"kl": 0.451171875, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": 0.0181, |
|
"reward": 0.1977246394380927, |
|
"reward_std": 0.16955609619617462, |
|
"rewards/cosine_scaled_reward": -0.0062361303716897964, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 1949.3750305175781, |
|
"epoch": 0.36457142857142855, |
|
"grad_norm": 3.923489809036255, |
|
"kl": 0.63671875, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": 0.0255, |
|
"reward": 0.13074796926230192, |
|
"reward_std": 0.1879247985780239, |
|
"rewards/cosine_scaled_reward": -0.09139842540025711, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 1168.2292175292969, |
|
"epoch": 0.3657142857142857, |
|
"grad_norm": 1.0396791696548462, |
|
"kl": 0.1934814453125, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0077, |
|
"reward": 0.22300215438008308, |
|
"reward_std": 0.2135285884141922, |
|
"rewards/cosine_scaled_reward": -0.06148634012788534, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 1763.3333435058594, |
|
"epoch": 0.3668571428571429, |
|
"grad_norm": 1.4253339767456055, |
|
"kl": 0.74658203125, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0299, |
|
"reward": 0.1205808836966753, |
|
"reward_std": 0.17672885209321976, |
|
"rewards/cosine_scaled_reward": -0.11066987551748753, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 1481.2917175292969, |
|
"epoch": 0.368, |
|
"grad_norm": 1.548012614250183, |
|
"kl": 0.4468994140625, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0179, |
|
"reward": 0.2291913628578186, |
|
"reward_std": 0.14006879553198814, |
|
"rewards/cosine_scaled_reward": 0.01431015320122242, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 1383.9583740234375, |
|
"epoch": 0.36914285714285716, |
|
"grad_norm": 1.8068033456802368, |
|
"kl": 0.530517578125, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.0212, |
|
"reward": 0.27303827553987503, |
|
"reward_std": 0.17342529818415642, |
|
"rewards/cosine_scaled_reward": 0.07235794328153133, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 1631.8333740234375, |
|
"epoch": 0.3702857142857143, |
|
"grad_norm": 3.112732410430908, |
|
"kl": 0.5965576171875, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": 0.0238, |
|
"reward": 0.11710180155932903, |
|
"reward_std": 0.14361269772052765, |
|
"rewards/cosine_scaled_reward": -0.21200886741280556, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 967.8541717529297, |
|
"epoch": 0.37142857142857144, |
|
"grad_norm": 1.7522660493850708, |
|
"kl": 0.22650146484375, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0091, |
|
"reward": 0.2394350841641426, |
|
"reward_std": 0.1827605739235878, |
|
"rewards/cosine_scaled_reward": 0.019153601489961147, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 1521.2708587646484, |
|
"epoch": 0.37257142857142855, |
|
"grad_norm": 3.303557872772217, |
|
"kl": 0.71337890625, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.0285, |
|
"reward": 0.16831985116004944, |
|
"reward_std": 0.16597898304462433, |
|
"rewards/cosine_scaled_reward": -0.10751725360751152, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 1820.0625610351562, |
|
"epoch": 0.3737142857142857, |
|
"grad_norm": 2.2156002521514893, |
|
"kl": 1.386962890625, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.0555, |
|
"reward": 0.1753358980640769, |
|
"reward_std": 0.13772335462272167, |
|
"rewards/cosine_scaled_reward": -0.009533978998661041, |
|
"rewards/format_reward": 0.6875, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 1580.7708740234375, |
|
"epoch": 0.37485714285714283, |
|
"grad_norm": 2.2257449626922607, |
|
"kl": 0.70751953125, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": 0.0283, |
|
"reward": 0.20990237966179848, |
|
"reward_std": 0.18772607296705246, |
|
"rewards/cosine_scaled_reward": 0.03089301474392414, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 1922.5209045410156, |
|
"epoch": 0.376, |
|
"grad_norm": 2.8192689418792725, |
|
"kl": 1.236328125, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": 0.0494, |
|
"reward": 0.1401547589339316, |
|
"reward_std": 0.18911270424723625, |
|
"rewards/cosine_scaled_reward": -0.07860468700528145, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 2099.1875610351562, |
|
"epoch": 0.37714285714285717, |
|
"grad_norm": 2.138005018234253, |
|
"kl": 1.3330078125, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0532, |
|
"reward": 0.13079076260328293, |
|
"reward_std": 0.25324463099241257, |
|
"rewards/cosine_scaled_reward": -0.06187394913285971, |
|
"rewards/format_reward": 0.6250000298023224, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 1694.0416870117188, |
|
"epoch": 0.3782857142857143, |
|
"grad_norm": 4.427945613861084, |
|
"kl": 1.271484375, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": 0.0509, |
|
"reward": 0.06919177388772368, |
|
"reward_std": 0.14648743718862534, |
|
"rewards/cosine_scaled_reward": -0.2538422755897045, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 1520.4166870117188, |
|
"epoch": 0.37942857142857145, |
|
"grad_norm": 2.170893669128418, |
|
"kl": 1.00927734375, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": 0.0404, |
|
"reward": 0.2010207176208496, |
|
"reward_std": 0.22100840136408806, |
|
"rewards/cosine_scaled_reward": -0.01634824648499489, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 1651.2083435058594, |
|
"epoch": 0.38057142857142856, |
|
"grad_norm": 3.6611242294311523, |
|
"kl": 1.2607421875, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.0505, |
|
"reward": 0.09817091876175255, |
|
"reward_std": 0.15193179063498974, |
|
"rewards/cosine_scaled_reward": -0.21044551581144333, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 1986.229248046875, |
|
"epoch": 0.38171428571428573, |
|
"grad_norm": 3.5136094093322754, |
|
"kl": 1.3896484375, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": 0.0556, |
|
"reward": 0.17547175474464893, |
|
"reward_std": 0.2079135961830616, |
|
"rewards/cosine_scaled_reward": -0.031742025166749954, |
|
"rewards/format_reward": 0.7291667014360428, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 1639.3750610351562, |
|
"epoch": 0.38285714285714284, |
|
"grad_norm": 4.113811016082764, |
|
"kl": 1.249267578125, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.0501, |
|
"reward": 0.14984197542071342, |
|
"reward_std": 0.16426498722285032, |
|
"rewards/cosine_scaled_reward": -0.14330823719501495, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 1604.9583740234375, |
|
"epoch": 0.384, |
|
"grad_norm": 2.564976215362549, |
|
"kl": 0.9404296875, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": 0.0376, |
|
"reward": 0.08363019116222858, |
|
"reward_std": 0.13052179291844368, |
|
"rewards/cosine_scaled_reward": -0.2588311657309532, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 1238.4583740234375, |
|
"epoch": 0.3851428571428571, |
|
"grad_norm": 2.2139716148376465, |
|
"kl": 0.671295166015625, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.0269, |
|
"reward": 0.17029657028615475, |
|
"reward_std": 0.23339306563138962, |
|
"rewards/cosine_scaled_reward": -0.097343516536057, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 2030.3541870117188, |
|
"epoch": 0.3862857142857143, |
|
"grad_norm": 2.1332132816314697, |
|
"kl": 1.28515625, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.0514, |
|
"reward": 0.0918416610584245, |
|
"reward_std": 0.19109328091144562, |
|
"rewards/cosine_scaled_reward": -0.16856661438941956, |
|
"rewards/format_reward": 0.6875000074505806, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 2271.2500610351562, |
|
"epoch": 0.38742857142857146, |
|
"grad_norm": 1.892738938331604, |
|
"kl": 1.158203125, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": 0.0462, |
|
"reward": 0.08900500182062387, |
|
"reward_std": 0.21380429714918137, |
|
"rewards/cosine_scaled_reward": -0.12333061918616295, |
|
"rewards/format_reward": 0.5833333656191826, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 1519.2084045410156, |
|
"epoch": 0.38857142857142857, |
|
"grad_norm": 2.038071870803833, |
|
"kl": 0.5703125, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.0228, |
|
"reward": 0.24512270092964172, |
|
"reward_std": 0.18805953487753868, |
|
"rewards/cosine_scaled_reward": 0.04649870842695236, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 1584.0417175292969, |
|
"epoch": 0.38971428571428574, |
|
"grad_norm": 2.7954156398773193, |
|
"kl": 0.666015625, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": 0.0267, |
|
"reward": 0.2796506742015481, |
|
"reward_std": 0.18307143822312355, |
|
"rewards/cosine_scaled_reward": 0.09590147994458675, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 1688.1042175292969, |
|
"epoch": 0.39085714285714285, |
|
"grad_norm": 2.481614589691162, |
|
"kl": 0.646728515625, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": 0.0259, |
|
"reward": 0.13232821132987738, |
|
"reward_std": 0.15718812122941017, |
|
"rewards/cosine_scaled_reward": -0.10951092094182968, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 2052.0625915527344, |
|
"epoch": 0.392, |
|
"grad_norm": 1.4259511232376099, |
|
"kl": 1.07421875, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": 0.0429, |
|
"reward": 0.1814631875604391, |
|
"reward_std": 0.22088013961911201, |
|
"rewards/cosine_scaled_reward": -0.04379495978355408, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 1663.125, |
|
"epoch": 0.3931428571428571, |
|
"grad_norm": 1.6074610948562622, |
|
"kl": 0.69775390625, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": 0.0279, |
|
"reward": 0.21839727461338043, |
|
"reward_std": 0.15816613100469112, |
|
"rewards/cosine_scaled_reward": 0.029275711625814438, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 1546.8333587646484, |
|
"epoch": 0.3942857142857143, |
|
"grad_norm": 1.4356237649917603, |
|
"kl": 0.76837158203125, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0308, |
|
"reward": 0.2311046477407217, |
|
"reward_std": 0.16006076335906982, |
|
"rewards/cosine_scaled_reward": 0.007090110331773758, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 2192.6459350585938, |
|
"epoch": 0.3954285714285714, |
|
"grad_norm": 3.3563661575317383, |
|
"kl": 1.0556640625, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": 0.0422, |
|
"reward": 0.08254441898316145, |
|
"reward_std": 0.21427064761519432, |
|
"rewards/cosine_scaled_reward": -0.1869775615632534, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 1711.5834350585938, |
|
"epoch": 0.3965714285714286, |
|
"grad_norm": 2.4092659950256348, |
|
"kl": 0.818359375, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": 0.0328, |
|
"reward": 0.3048556298017502, |
|
"reward_std": 0.24222856387495995, |
|
"rewards/cosine_scaled_reward": 0.14202131098136306, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 1510.9583435058594, |
|
"epoch": 0.3977142857142857, |
|
"grad_norm": 2.6288950443267822, |
|
"kl": 0.750732421875, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": 0.03, |
|
"reward": 0.18389190919697285, |
|
"reward_std": 0.22533446922898293, |
|
"rewards/cosine_scaled_reward": -0.06304232217371464, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 1343.7083587646484, |
|
"epoch": 0.39885714285714285, |
|
"grad_norm": 1.5651475191116333, |
|
"kl": 0.58001708984375, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": 0.0232, |
|
"reward": 0.3366018421947956, |
|
"reward_std": 0.2321232110261917, |
|
"rewards/cosine_scaled_reward": 0.21495839580893517, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 1627.8750610351562, |
|
"epoch": 0.4, |
|
"grad_norm": 1.5066512823104858, |
|
"kl": 0.85693359375, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0343, |
|
"reward": 0.27388138696551323, |
|
"reward_std": 0.2236754074692726, |
|
"rewards/cosine_scaled_reward": 0.14884734898805618, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 1225.2916870117188, |
|
"epoch": 0.40114285714285713, |
|
"grad_norm": 1.9423860311508179, |
|
"kl": 0.312164306640625, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": 0.0124, |
|
"reward": 0.1787915969034657, |
|
"reward_std": 0.138715498149395, |
|
"rewards/cosine_scaled_reward": -0.0604798283893615, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 1779.354248046875, |
|
"epoch": 0.4022857142857143, |
|
"grad_norm": 2.395528793334961, |
|
"kl": 1.109375, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": 0.0444, |
|
"reward": 0.11777728889137506, |
|
"reward_std": 0.20832915045320988, |
|
"rewards/cosine_scaled_reward": -0.16191274672746658, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 1802.1875305175781, |
|
"epoch": 0.4034285714285714, |
|
"grad_norm": 3.2702293395996094, |
|
"kl": 0.888671875, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": 0.0355, |
|
"reward": 0.14973015896975994, |
|
"reward_std": 0.2422914244234562, |
|
"rewards/cosine_scaled_reward": -0.08677250519394875, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 1341.9166870117188, |
|
"epoch": 0.4045714285714286, |
|
"grad_norm": 2.3067736625671387, |
|
"kl": 0.63531494140625, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": 0.0254, |
|
"reward": 0.08647960424423218, |
|
"reward_std": 0.14325924962759018, |
|
"rewards/cosine_scaled_reward": -0.2389773204922676, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 1209.0000305175781, |
|
"epoch": 0.4057142857142857, |
|
"grad_norm": 2.9223105907440186, |
|
"kl": 0.60986328125, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0244, |
|
"reward": 0.21037685312330723, |
|
"reward_std": 0.15928080305457115, |
|
"rewards/cosine_scaled_reward": -0.018377395812422037, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 1625.2292175292969, |
|
"epoch": 0.40685714285714286, |
|
"grad_norm": 2.497197151184082, |
|
"kl": 1.08203125, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": 0.0433, |
|
"reward": 0.15635429695248604, |
|
"reward_std": 0.18079080618917942, |
|
"rewards/cosine_scaled_reward": -0.09902848303318024, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 1548.1250457763672, |
|
"epoch": 0.408, |
|
"grad_norm": 2.1372578144073486, |
|
"kl": 1.04052734375, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": 0.0416, |
|
"reward": 0.1531627606600523, |
|
"reward_std": 0.13187414780259132, |
|
"rewards/cosine_scaled_reward": -0.15889397263526917, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 1938.7500915527344, |
|
"epoch": 0.40914285714285714, |
|
"grad_norm": 3.2893924713134766, |
|
"kl": 1.296875, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": 0.052, |
|
"reward": 0.23603218793869019, |
|
"reward_std": 0.2104952149093151, |
|
"rewards/cosine_scaled_reward": 0.06818200647830963, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 1526.7292175292969, |
|
"epoch": 0.4102857142857143, |
|
"grad_norm": 1.6249175071716309, |
|
"kl": 0.94677734375, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.038, |
|
"reward": 0.14598742313683033, |
|
"reward_std": 0.16005707904696465, |
|
"rewards/cosine_scaled_reward": -0.08985930308699608, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 1373.6459045410156, |
|
"epoch": 0.4114285714285714, |
|
"grad_norm": 2.30932354927063, |
|
"kl": 1.03515625, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0414, |
|
"reward": 0.20623124949634075, |
|
"reward_std": 0.24741016328334808, |
|
"rewards/cosine_scaled_reward": -0.009535698220133781, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 1704.8750457763672, |
|
"epoch": 0.4125714285714286, |
|
"grad_norm": 1.839920163154602, |
|
"kl": 1.0859375, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": 0.0434, |
|
"reward": 0.12991097196936607, |
|
"reward_std": 0.16597579792141914, |
|
"rewards/cosine_scaled_reward": -0.13096854276955128, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 1567.2083740234375, |
|
"epoch": 0.4137142857142857, |
|
"grad_norm": 1.8752156496047974, |
|
"kl": 1.0390625, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": 0.0416, |
|
"reward": 0.1774279922246933, |
|
"reward_std": 0.20492257550358772, |
|
"rewards/cosine_scaled_reward": -0.10070006363093853, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 1532.4584350585938, |
|
"epoch": 0.41485714285714287, |
|
"grad_norm": 2.349215507507324, |
|
"kl": 1.103759765625, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.0442, |
|
"reward": 0.12260803673416376, |
|
"reward_std": 0.21350538730621338, |
|
"rewards/cosine_scaled_reward": -0.17045626137405634, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 1452.5417175292969, |
|
"epoch": 0.416, |
|
"grad_norm": 1.401031255722046, |
|
"kl": 1.0751953125, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": 0.043, |
|
"reward": 0.16770456731319427, |
|
"reward_std": 0.1539650922641158, |
|
"rewards/cosine_scaled_reward": -0.10999439284205437, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 2189.7084350585938, |
|
"epoch": 0.41714285714285715, |
|
"grad_norm": 3.1197776794433594, |
|
"kl": 1.63671875, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.0654, |
|
"reward": 0.011583839543163776, |
|
"reward_std": 0.16744238138198853, |
|
"rewards/cosine_scaled_reward": -0.26333725824952126, |
|
"rewards/format_reward": 0.5625, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 1772.6666870117188, |
|
"epoch": 0.41828571428571426, |
|
"grad_norm": 1.569739818572998, |
|
"kl": 0.8743896484375, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": 0.0349, |
|
"reward": 0.18236286379396915, |
|
"reward_std": 0.204185388982296, |
|
"rewards/cosine_scaled_reward": -0.01714538410305977, |
|
"rewards/format_reward": 0.7291667014360428, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 1263.0833740234375, |
|
"epoch": 0.41942857142857143, |
|
"grad_norm": 1.6131809949874878, |
|
"kl": 0.82684326171875, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": 0.0331, |
|
"reward": 0.24072659714147449, |
|
"reward_std": 0.16533087193965912, |
|
"rewards/cosine_scaled_reward": 0.038546825759112835, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 1519.979232788086, |
|
"epoch": 0.4205714285714286, |
|
"grad_norm": 3.1444971561431885, |
|
"kl": 0.759765625, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0304, |
|
"reward": 0.20653121452778578, |
|
"reward_std": 0.18122152984142303, |
|
"rewards/cosine_scaled_reward": -0.00040038255974650383, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 1517.1875305175781, |
|
"epoch": 0.4217142857142857, |
|
"grad_norm": 1.515045166015625, |
|
"kl": 0.861328125, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": 0.0344, |
|
"reward": 0.2802426964044571, |
|
"reward_std": 0.1838936284184456, |
|
"rewards/cosine_scaled_reward": 0.1429410008713603, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 1656.1458740234375, |
|
"epoch": 0.4228571428571429, |
|
"grad_norm": 3.1206541061401367, |
|
"kl": 0.80908203125, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0323, |
|
"reward": 0.0936688520014286, |
|
"reward_std": 0.13261800445616245, |
|
"rewards/cosine_scaled_reward": -0.2378309927880764, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 1568.1875305175781, |
|
"epoch": 0.424, |
|
"grad_norm": 2.3558554649353027, |
|
"kl": 0.93798828125, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0375, |
|
"reward": 0.13434469606727362, |
|
"reward_std": 0.16923817060887814, |
|
"rewards/cosine_scaled_reward": -0.14115899708122015, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 1899.3333435058594, |
|
"epoch": 0.42514285714285716, |
|
"grad_norm": 2.514725923538208, |
|
"kl": 1.216796875, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": 0.0486, |
|
"reward": 0.13189218938350677, |
|
"reward_std": 0.11921382136642933, |
|
"rewards/cosine_scaled_reward": -0.14287901669740677, |
|
"rewards/format_reward": 0.7916667014360428, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 1694.166748046875, |
|
"epoch": 0.42628571428571427, |
|
"grad_norm": 2.2729876041412354, |
|
"kl": 1.076171875, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": 0.043, |
|
"reward": 0.13227892480790615, |
|
"reward_std": 0.18314684182405472, |
|
"rewards/cosine_scaled_reward": -0.15385251492261887, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 1518.2917175292969, |
|
"epoch": 0.42742857142857144, |
|
"grad_norm": 1.470110535621643, |
|
"kl": 0.7646484375, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": 0.0306, |
|
"reward": 0.24618086963891983, |
|
"reward_std": 0.18493768386542797, |
|
"rewards/cosine_scaled_reward": 0.02095278911292553, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 1516.7084045410156, |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 1.2037338018417358, |
|
"kl": 0.9248046875, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.037, |
|
"reward": 0.19765574857592583, |
|
"reward_std": 0.23449427634477615, |
|
"rewards/cosine_scaled_reward": -0.02087587956339121, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 1457.7083740234375, |
|
"epoch": 0.4297142857142857, |
|
"grad_norm": 2.412107229232788, |
|
"kl": 0.76611328125, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": 0.0306, |
|
"reward": 0.2110733650624752, |
|
"reward_std": 0.24278932064771652, |
|
"rewards/cosine_scaled_reward": -0.01106532383710146, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 1775.1458740234375, |
|
"epoch": 0.4308571428571429, |
|
"grad_norm": 1.3878467082977295, |
|
"kl": 1.2080078125, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.0483, |
|
"reward": 0.1983587248250842, |
|
"reward_std": 0.2635771445930004, |
|
"rewards/cosine_scaled_reward": -0.025547289289534092, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 1364.0625305175781, |
|
"epoch": 0.432, |
|
"grad_norm": 2.308479070663452, |
|
"kl": 0.650390625, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": 0.026, |
|
"reward": 0.09404418990015984, |
|
"reward_std": 0.1384107507765293, |
|
"rewards/cosine_scaled_reward": -0.2571399100124836, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 1627.7916870117188, |
|
"epoch": 0.43314285714285716, |
|
"grad_norm": 1.8004807233810425, |
|
"kl": 0.79296875, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": 0.0317, |
|
"reward": 0.24452932178974152, |
|
"reward_std": 0.20264879241585732, |
|
"rewards/cosine_scaled_reward": 0.02817649580538273, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 1773.5208740234375, |
|
"epoch": 0.4342857142857143, |
|
"grad_norm": 2.1422038078308105, |
|
"kl": 1.15234375, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.046, |
|
"reward": 0.11856007762253284, |
|
"reward_std": 0.14229279570281506, |
|
"rewards/cosine_scaled_reward": -0.1719975546002388, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 1725.7500915527344, |
|
"epoch": 0.43542857142857144, |
|
"grad_norm": 2.1980226039886475, |
|
"kl": 0.9560546875, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": 0.0381, |
|
"reward": 0.14323885599151254, |
|
"reward_std": 0.21406007558107376, |
|
"rewards/cosine_scaled_reward": -0.1250370437046513, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 1673.3333740234375, |
|
"epoch": 0.43657142857142855, |
|
"grad_norm": 1.6990046501159668, |
|
"kl": 1.20703125, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": 0.0484, |
|
"reward": 0.11549163609743118, |
|
"reward_std": 0.19104652479290962, |
|
"rewards/cosine_scaled_reward": -0.16341273672878742, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 2062.479248046875, |
|
"epoch": 0.4377142857142857, |
|
"grad_norm": 2.1265945434570312, |
|
"kl": 1.5693359375, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": 0.0627, |
|
"reward": 0.14214371237903833, |
|
"reward_std": 0.21174047514796257, |
|
"rewards/cosine_scaled_reward": -0.05033590830862522, |
|
"rewards/format_reward": 0.6458333730697632, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 1379.2708740234375, |
|
"epoch": 0.43885714285714283, |
|
"grad_norm": 2.062028408050537, |
|
"kl": 0.75439453125, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": 0.0302, |
|
"reward": 0.07300803670659661, |
|
"reward_std": 0.12315612472593784, |
|
"rewards/cosine_scaled_reward": -0.2674658801406622, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 1687.6250762939453, |
|
"epoch": 0.44, |
|
"grad_norm": 2.788571834564209, |
|
"kl": 0.99072265625, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0395, |
|
"reward": 0.20402609836310148, |
|
"reward_std": 0.2159460373222828, |
|
"rewards/cosine_scaled_reward": -0.028315742500126362, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 1615.9583740234375, |
|
"epoch": 0.44114285714285717, |
|
"grad_norm": 1.576268196105957, |
|
"kl": 1.1650390625, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": 0.0467, |
|
"reward": 0.17687237821519375, |
|
"reward_std": 0.1281549371778965, |
|
"rewards/cosine_scaled_reward": -0.10758153721690178, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 1551.5625610351562, |
|
"epoch": 0.4422857142857143, |
|
"grad_norm": 1.610855221748352, |
|
"kl": 0.573486328125, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": 0.0229, |
|
"reward": 0.1070600375533104, |
|
"reward_std": 0.16446635872125626, |
|
"rewards/cosine_scaled_reward": -0.22158217430114746, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 387 |
|
}, |
|
{ |
|
"completion_length": 1435.6458435058594, |
|
"epoch": 0.44342857142857145, |
|
"grad_norm": 2.12188720703125, |
|
"kl": 0.685302734375, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": 0.0274, |
|
"reward": 0.20380490552634, |
|
"reward_std": 0.11574576422572136, |
|
"rewards/cosine_scaled_reward": -0.036669282941147685, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 1297.0417175292969, |
|
"epoch": 0.44457142857142856, |
|
"grad_norm": 1.524010181427002, |
|
"kl": 0.875732421875, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": 0.035, |
|
"reward": 0.2976585365831852, |
|
"reward_std": 0.23040159419178963, |
|
"rewards/cosine_scaled_reward": 0.12910030595958233, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 389 |
|
}, |
|
{ |
|
"completion_length": 1280.8541717529297, |
|
"epoch": 0.44571428571428573, |
|
"grad_norm": 0.8724046349525452, |
|
"kl": 0.746826171875, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.0298, |
|
"reward": 0.1727432645857334, |
|
"reward_std": 0.17489107139408588, |
|
"rewards/cosine_scaled_reward": -0.11648351605981588, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 1218.6458740234375, |
|
"epoch": 0.44685714285714284, |
|
"grad_norm": 1.9020179510116577, |
|
"kl": 0.556396484375, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": 0.0223, |
|
"reward": 0.13444999419152737, |
|
"reward_std": 0.1418241187930107, |
|
"rewards/cosine_scaled_reward": -0.2053442131727934, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 391 |
|
}, |
|
{ |
|
"completion_length": 1699.0833740234375, |
|
"epoch": 0.448, |
|
"grad_norm": 3.0778512954711914, |
|
"kl": 0.791015625, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": 0.0316, |
|
"reward": 0.23890820518136024, |
|
"reward_std": 0.3118077628314495, |
|
"rewards/cosine_scaled_reward": 0.07110257190652192, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 1719.2916870117188, |
|
"epoch": 0.4491428571428571, |
|
"grad_norm": 1.7322001457214355, |
|
"kl": 1.0498046875, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.042, |
|
"reward": 0.2058930192142725, |
|
"reward_std": 0.2754768989980221, |
|
"rewards/cosine_scaled_reward": 0.014654617756605148, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 393 |
|
}, |
|
{ |
|
"completion_length": 1581.8958740234375, |
|
"epoch": 0.4502857142857143, |
|
"grad_norm": 1.7970198392868042, |
|
"kl": 0.75732421875, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": 0.0303, |
|
"reward": 0.1278714146465063, |
|
"reward_std": 0.16577338986098766, |
|
"rewards/cosine_scaled_reward": -0.18303980166092515, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 1615.8542175292969, |
|
"epoch": 0.4514285714285714, |
|
"grad_norm": 1.6947509050369263, |
|
"kl": 0.90771484375, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0363, |
|
"reward": 0.16298719588667154, |
|
"reward_std": 0.1734189111739397, |
|
"rewards/cosine_scaled_reward": -0.09799596574157476, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 1059.1041870117188, |
|
"epoch": 0.45257142857142857, |
|
"grad_norm": 1.1382616758346558, |
|
"kl": 0.41558837890625, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": 0.0166, |
|
"reward": 0.20291254110634327, |
|
"reward_std": 0.1326997596770525, |
|
"rewards/cosine_scaled_reward": -0.04992395639419556, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 1636.5625305175781, |
|
"epoch": 0.45371428571428574, |
|
"grad_norm": 1.6751741170883179, |
|
"kl": 1.0986328125, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": 0.044, |
|
"reward": 0.34289546124637127, |
|
"reward_std": 0.18598736822605133, |
|
"rewards/cosine_scaled_reward": 0.26703778095543385, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 397 |
|
}, |
|
{ |
|
"completion_length": 1436.0833435058594, |
|
"epoch": 0.45485714285714285, |
|
"grad_norm": 1.3158280849456787, |
|
"kl": 0.488494873046875, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0195, |
|
"reward": 0.18827892653644085, |
|
"reward_std": 0.18240927904844284, |
|
"rewards/cosine_scaled_reward": -0.06533949635922909, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 1633.2500610351562, |
|
"epoch": 0.456, |
|
"grad_norm": 1.3960447311401367, |
|
"kl": 0.5896148681640625, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": 0.0236, |
|
"reward": 0.15800490105175413, |
|
"reward_std": 0.14584489725530148, |
|
"rewards/cosine_scaled_reward": -0.07036726316437125, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 399 |
|
}, |
|
{ |
|
"completion_length": 1449.3333587646484, |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 2.858807325363159, |
|
"kl": 0.79345703125, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0317, |
|
"reward": 0.17510544694960117, |
|
"reward_std": 0.20562008023262024, |
|
"rewards/cosine_scaled_reward": -0.060673171654343605, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 1245.7083587646484, |
|
"epoch": 0.4582857142857143, |
|
"grad_norm": 3.399758815765381, |
|
"kl": 0.52587890625, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": 0.021, |
|
"reward": 0.2145949751138687, |
|
"reward_std": 0.17422041855752468, |
|
"rewards/cosine_scaled_reward": 0.00034935586154460907, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 401 |
|
}, |
|
{ |
|
"completion_length": 2146.5625610351562, |
|
"epoch": 0.4594285714285714, |
|
"grad_norm": 2.2153425216674805, |
|
"kl": 1.408203125, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": 0.0564, |
|
"reward": 0.06373792514204979, |
|
"reward_std": 0.11548986099660397, |
|
"rewards/cosine_scaled_reward": -0.2942846156656742, |
|
"rewards/format_reward": 0.8333333730697632, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 1795.0833435058594, |
|
"epoch": 0.4605714285714286, |
|
"grad_norm": 2.41886043548584, |
|
"kl": 0.943359375, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": 0.0378, |
|
"reward": 0.12751809041947126, |
|
"reward_std": 0.174501184374094, |
|
"rewards/cosine_scaled_reward": -0.1311726775020361, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 403 |
|
}, |
|
{ |
|
"completion_length": 1662.2500610351562, |
|
"epoch": 0.4617142857142857, |
|
"grad_norm": 1.9953243732452393, |
|
"kl": 0.9521484375, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": 0.0381, |
|
"reward": 0.21517368033528328, |
|
"reward_std": 0.18545049242675304, |
|
"rewards/cosine_scaled_reward": 0.005411209538578987, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 404 |
|
}, |
|
{ |
|
"completion_length": 1612.9167175292969, |
|
"epoch": 0.46285714285714286, |
|
"grad_norm": 1.684091567993164, |
|
"kl": 0.84716796875, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.0339, |
|
"reward": 0.2171931341290474, |
|
"reward_std": 0.18458830192685127, |
|
"rewards/cosine_scaled_reward": 0.0182991623878479, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 1634.7709045410156, |
|
"epoch": 0.464, |
|
"grad_norm": 2.3493168354034424, |
|
"kl": 1.029296875, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.0412, |
|
"reward": 0.19101523607969284, |
|
"reward_std": 0.16082188487052917, |
|
"rewards/cosine_scaled_reward": -0.06326993182301521, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 406 |
|
}, |
|
{ |
|
"completion_length": 1121.5416870117188, |
|
"epoch": 0.46514285714285714, |
|
"grad_norm": 2.3104114532470703, |
|
"kl": 0.4576416015625, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": 0.0183, |
|
"reward": 0.21706481464207172, |
|
"reward_std": 0.14366690441966057, |
|
"rewards/cosine_scaled_reward": -0.025802362710237503, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 407 |
|
}, |
|
{ |
|
"completion_length": 1074.9791870117188, |
|
"epoch": 0.4662857142857143, |
|
"grad_norm": 1.874539852142334, |
|
"kl": 0.44500732421875, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.0178, |
|
"reward": 0.23207461088895798, |
|
"reward_std": 0.20729456096887589, |
|
"rewards/cosine_scaled_reward": -0.004256272688508034, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 1555.0833740234375, |
|
"epoch": 0.4674285714285714, |
|
"grad_norm": 2.038965940475464, |
|
"kl": 1.02734375, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": 0.0411, |
|
"reward": 0.26453279703855515, |
|
"reward_std": 0.19898507371544838, |
|
"rewards/cosine_scaled_reward": 0.09699833486229181, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 409 |
|
}, |
|
{ |
|
"completion_length": 1686.0416870117188, |
|
"epoch": 0.4685714285714286, |
|
"grad_norm": 2.4619529247283936, |
|
"kl": 1.145263671875, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.0459, |
|
"reward": 0.17928657121956348, |
|
"reward_std": 0.18281647004187107, |
|
"rewards/cosine_scaled_reward": -0.05253279022872448, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 1760.2084350585938, |
|
"epoch": 0.4697142857142857, |
|
"grad_norm": 2.5391340255737305, |
|
"kl": 1.0654296875, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": 0.0426, |
|
"reward": 0.08570251986384392, |
|
"reward_std": 0.2112545520067215, |
|
"rewards/cosine_scaled_reward": -0.18753627687692642, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 411 |
|
}, |
|
{ |
|
"completion_length": 1102.8542175292969, |
|
"epoch": 0.47085714285714286, |
|
"grad_norm": 0.7402982115745544, |
|
"kl": 0.23406982421875, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": 0.0094, |
|
"reward": 0.16945463605225086, |
|
"reward_std": 0.13772385567426682, |
|
"rewards/cosine_scaled_reward": -0.15359408780932426, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 412 |
|
}, |
|
{ |
|
"completion_length": 1526.5208740234375, |
|
"epoch": 0.472, |
|
"grad_norm": 2.8514583110809326, |
|
"kl": 0.84326171875, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": 0.0338, |
|
"reward": 0.07859875238500535, |
|
"reward_std": 0.14428682066500187, |
|
"rewards/cosine_scaled_reward": -0.23446397110819817, |
|
"rewards/format_reward": 0.7708333432674408, |
|
"step": 413 |
|
}, |
|
{ |
|
"completion_length": 1553.0208740234375, |
|
"epoch": 0.47314285714285714, |
|
"grad_norm": 1.3336910009384155, |
|
"kl": 1.005615234375, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": 0.0403, |
|
"reward": 0.24459942057728767, |
|
"reward_std": 0.1802833005785942, |
|
"rewards/cosine_scaled_reward": 0.06333907938096672, |
|
"rewards/format_reward": 0.8125, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 1626.3334045410156, |
|
"epoch": 0.4742857142857143, |
|
"grad_norm": 2.653970718383789, |
|
"kl": 0.87548828125, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.035, |
|
"reward": 0.25224715657532215, |
|
"reward_std": 0.24098404496908188, |
|
"rewards/cosine_scaled_reward": 0.05381806939840317, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 1558.2083740234375, |
|
"epoch": 0.4754285714285714, |
|
"grad_norm": 2.744907855987549, |
|
"kl": 0.89990234375, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": 0.0359, |
|
"reward": 0.16436111507937312, |
|
"reward_std": 0.18552083894610405, |
|
"rewards/cosine_scaled_reward": -0.0836386177688837, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 416 |
|
}, |
|
{ |
|
"completion_length": 1913.791748046875, |
|
"epoch": 0.4765714285714286, |
|
"grad_norm": 2.4131460189819336, |
|
"kl": 1.46630859375, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": 0.0587, |
|
"reward": 0.10155561100691557, |
|
"reward_std": 0.19174444302916527, |
|
"rewards/cosine_scaled_reward": -0.15042966604232788, |
|
"rewards/format_reward": 0.6875000149011612, |
|
"step": 417 |
|
}, |
|
{ |
|
"completion_length": 1520.8333740234375, |
|
"epoch": 0.4777142857142857, |
|
"grad_norm": 3.986926317214966, |
|
"kl": 1.1474609375, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": 0.0459, |
|
"reward": 0.1306799417361617, |
|
"reward_std": 0.1696070432662964, |
|
"rewards/cosine_scaled_reward": -0.15072840079665184, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 418 |
|
}, |
|
{ |
|
"completion_length": 1411.2500610351562, |
|
"epoch": 0.47885714285714287, |
|
"grad_norm": 1.6700776815414429, |
|
"kl": 0.77392578125, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.0309, |
|
"reward": 0.11982119083404541, |
|
"reward_std": 0.14616922289133072, |
|
"rewards/cosine_scaled_reward": -0.2155298045836389, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 419 |
|
}, |
|
{ |
|
"completion_length": 1086.3750457763672, |
|
"epoch": 0.48, |
|
"grad_norm": 2.8618907928466797, |
|
"kl": 0.39306640625, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0157, |
|
"reward": 0.2151604052633047, |
|
"reward_std": 0.20470884442329407, |
|
"rewards/cosine_scaled_reward": -0.01572578027844429, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 1218.75, |
|
"epoch": 0.48114285714285715, |
|
"grad_norm": 1.8824008703231812, |
|
"kl": 0.43896484375, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0175, |
|
"reward": 0.27937930822372437, |
|
"reward_std": 0.15520622581243515, |
|
"rewards/cosine_scaled_reward": 0.03862675465643406, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 421 |
|
}, |
|
{ |
|
"completion_length": 1495.3333740234375, |
|
"epoch": 0.48228571428571426, |
|
"grad_norm": 1.9558619260787964, |
|
"kl": 0.87109375, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": 0.0348, |
|
"reward": 0.18802766874432564, |
|
"reward_std": 0.15786195173859596, |
|
"rewards/cosine_scaled_reward": -0.1094935517758131, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 422 |
|
}, |
|
{ |
|
"completion_length": 1453.8125457763672, |
|
"epoch": 0.48342857142857143, |
|
"grad_norm": 3.1225712299346924, |
|
"kl": 0.58935546875, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": 0.0235, |
|
"reward": 0.1692509911954403, |
|
"reward_std": 0.1672309935092926, |
|
"rewards/cosine_scaled_reward": -0.09207071270793676, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 423 |
|
}, |
|
{ |
|
"completion_length": 1574.9583740234375, |
|
"epoch": 0.4845714285714286, |
|
"grad_norm": 1.3268849849700928, |
|
"kl": 0.6416015625, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": 0.0256, |
|
"reward": 0.20429514534771442, |
|
"reward_std": 0.14665967971086502, |
|
"rewards/cosine_scaled_reward": -0.08527943585067987, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 424 |
|
}, |
|
{ |
|
"completion_length": 1544.3542175292969, |
|
"epoch": 0.4857142857142857, |
|
"grad_norm": 2.838041067123413, |
|
"kl": 0.59716796875, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0239, |
|
"reward": 0.15881402208469808, |
|
"reward_std": 0.23204398341476917, |
|
"rewards/cosine_scaled_reward": -0.10290774330496788, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 1421.0208740234375, |
|
"epoch": 0.4868571428571429, |
|
"grad_norm": 2.232002019882202, |
|
"kl": 0.671875, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": 0.0269, |
|
"reward": 0.16967828944325447, |
|
"reward_std": 0.22798865288496017, |
|
"rewards/cosine_scaled_reward": -0.06918483227491379, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 1699.3125610351562, |
|
"epoch": 0.488, |
|
"grad_norm": 1.7416229248046875, |
|
"kl": 1.1943359375, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": 0.0478, |
|
"reward": 0.16669721342623234, |
|
"reward_std": 0.23273145407438278, |
|
"rewards/cosine_scaled_reward": -0.0462256595492363, |
|
"rewards/format_reward": 0.7291666865348816, |
|
"step": 427 |
|
}, |
|
{ |
|
"completion_length": 1743.4375610351562, |
|
"epoch": 0.48914285714285716, |
|
"grad_norm": 1.2741050720214844, |
|
"kl": 0.7529296875, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": 0.0301, |
|
"reward": 0.1672147586941719, |
|
"reward_std": 0.12992947921156883, |
|
"rewards/cosine_scaled_reward": -0.1202060398645699, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 428 |
|
}, |
|
{ |
|
"completion_length": 1644.7916870117188, |
|
"epoch": 0.49028571428571427, |
|
"grad_norm": 2.6569628715515137, |
|
"kl": 0.9384765625, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.0375, |
|
"reward": 0.19867986720055342, |
|
"reward_std": 0.2007097192108631, |
|
"rewards/cosine_scaled_reward": -0.02384123019874096, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 429 |
|
}, |
|
{ |
|
"completion_length": 1961.1666870117188, |
|
"epoch": 0.49142857142857144, |
|
"grad_norm": 1.7140686511993408, |
|
"kl": 1.21484375, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0486, |
|
"reward": 0.10232608858495951, |
|
"reward_std": 0.16942250356078148, |
|
"rewards/cosine_scaled_reward": -0.09778407961130142, |
|
"rewards/format_reward": 0.5833333656191826, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 1212.3333740234375, |
|
"epoch": 0.49257142857142855, |
|
"grad_norm": 2.095090866088867, |
|
"kl": 0.676513671875, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.027, |
|
"reward": 0.25252123549580574, |
|
"reward_std": 0.17253945022821426, |
|
"rewards/cosine_scaled_reward": 0.02987060695886612, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 431 |
|
}, |
|
{ |
|
"completion_length": 1627.5833740234375, |
|
"epoch": 0.4937142857142857, |
|
"grad_norm": 1.731967806816101, |
|
"kl": 1.109375, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": 0.0444, |
|
"reward": 0.1790441758930683, |
|
"reward_std": 0.21966011822223663, |
|
"rewards/cosine_scaled_reward": -0.07679930981248617, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 1566.9583587646484, |
|
"epoch": 0.4948571428571429, |
|
"grad_norm": 2.4567792415618896, |
|
"kl": 0.8028564453125, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": 0.0321, |
|
"reward": 0.16420520408428274, |
|
"reward_std": 0.19683999940752983, |
|
"rewards/cosine_scaled_reward": -0.123224092181772, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 433 |
|
}, |
|
{ |
|
"completion_length": 982.0000610351562, |
|
"epoch": 0.496, |
|
"grad_norm": 19.401472091674805, |
|
"kl": 0.561065673828125, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": 0.0225, |
|
"reward": 0.32596323639154434, |
|
"reward_std": 0.19744369760155678, |
|
"rewards/cosine_scaled_reward": 0.14631427451968193, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 434 |
|
}, |
|
{ |
|
"completion_length": 1534.7917175292969, |
|
"epoch": 0.49714285714285716, |
|
"grad_norm": 6.2816362380981445, |
|
"kl": 1.1749267578125, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.0471, |
|
"reward": 0.13189730420708656, |
|
"reward_std": 0.14833365753293037, |
|
"rewards/cosine_scaled_reward": -0.17807801440358162, |
|
"rewards/format_reward": 0.8541666716337204, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 1608.6875305175781, |
|
"epoch": 0.4982857142857143, |
|
"grad_norm": 1.2652208805084229, |
|
"kl": 0.8369140625, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": 0.0334, |
|
"reward": 0.1510629504919052, |
|
"reward_std": 0.20369192957878113, |
|
"rewards/cosine_scaled_reward": -0.11619596276432276, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 436 |
|
}, |
|
{ |
|
"completion_length": 1255.1458740234375, |
|
"epoch": 0.49942857142857144, |
|
"grad_norm": 1.7958463430404663, |
|
"kl": 0.46435546875, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": 0.0186, |
|
"reward": 0.23130175843834877, |
|
"reward_std": 0.1911556702107191, |
|
"rewards/cosine_scaled_reward": -0.006543227471411228, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 437 |
|
}, |
|
{ |
|
"completion_length": 1066.3333740234375, |
|
"epoch": 0.5005714285714286, |
|
"grad_norm": 2.252562999725342, |
|
"kl": 0.41064453125, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": 0.0164, |
|
"reward": 0.21582691743969917, |
|
"reward_std": 0.18366244062781334, |
|
"rewards/cosine_scaled_reward": -0.07878882065415382, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 1176.6667022705078, |
|
"epoch": 0.5017142857142857, |
|
"grad_norm": 1.840851068496704, |
|
"kl": 0.83984375, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": 0.0336, |
|
"reward": 0.1915587205439806, |
|
"reward_std": 0.15870841406285763, |
|
"rewards/cosine_scaled_reward": -0.08411563746631145, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 439 |
|
}, |
|
{ |
|
"completion_length": 1799.9583740234375, |
|
"epoch": 0.5028571428571429, |
|
"grad_norm": 1.910586953163147, |
|
"kl": 1.419921875, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0569, |
|
"reward": 0.15009203157387674, |
|
"reward_std": 0.16570111364126205, |
|
"rewards/cosine_scaled_reward": -0.08884701132774353, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 1417.562515258789, |
|
"epoch": 0.504, |
|
"grad_norm": 2.846151828765869, |
|
"kl": 0.65283203125, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": 0.0261, |
|
"reward": 0.25939593836665154, |
|
"reward_std": 0.1876070685684681, |
|
"rewards/cosine_scaled_reward": 0.09080945514142513, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 441 |
|
}, |
|
{ |
|
"completion_length": 1395.0417175292969, |
|
"epoch": 0.5051428571428571, |
|
"grad_norm": 1.3554480075836182, |
|
"kl": 0.4969482421875, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": 0.0199, |
|
"reward": 0.1746810134500265, |
|
"reward_std": 0.18113730661571026, |
|
"rewards/cosine_scaled_reward": -0.11311907507479191, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 442 |
|
}, |
|
{ |
|
"completion_length": 1326.6250305175781, |
|
"epoch": 0.5062857142857143, |
|
"grad_norm": 2.2575602531433105, |
|
"kl": 0.5147705078125, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.0206, |
|
"reward": 0.17062465287745, |
|
"reward_std": 0.17788580060005188, |
|
"rewards/cosine_scaled_reward": -0.12135545909404755, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 443 |
|
}, |
|
{ |
|
"completion_length": 1452.4375305175781, |
|
"epoch": 0.5074285714285715, |
|
"grad_norm": 2.0496582984924316, |
|
"kl": 0.669921875, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": 0.0267, |
|
"reward": 0.21649761497974396, |
|
"reward_std": 0.1603663358837366, |
|
"rewards/cosine_scaled_reward": -0.0013428553938865662, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 1655.3750610351562, |
|
"epoch": 0.5085714285714286, |
|
"grad_norm": 1.7483857870101929, |
|
"kl": 1.0380859375, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.0415, |
|
"reward": 0.14976151008158922, |
|
"reward_std": 0.12271312810480595, |
|
"rewards/cosine_scaled_reward": -0.10680487379431725, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 1138.9375457763672, |
|
"epoch": 0.5097142857142857, |
|
"grad_norm": 1.3604767322540283, |
|
"kl": 0.5885009765625, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": 0.0235, |
|
"reward": 0.20577935874462128, |
|
"reward_std": 0.19339029118418694, |
|
"rewards/cosine_scaled_reward": -0.0911331009119749, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 446 |
|
}, |
|
{ |
|
"completion_length": 1263.1250610351562, |
|
"epoch": 0.5108571428571429, |
|
"grad_norm": 1.865720510482788, |
|
"kl": 1.154052734375, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.0462, |
|
"reward": 0.14829288161126897, |
|
"reward_std": 0.17679531127214432, |
|
"rewards/cosine_scaled_reward": -0.14904707111418247, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 447 |
|
}, |
|
{ |
|
"completion_length": 1118.375015258789, |
|
"epoch": 0.512, |
|
"grad_norm": 1.1738076210021973, |
|
"kl": 0.08587646484375, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": 0.0034, |
|
"reward": 0.29666774719953537, |
|
"reward_std": 0.2335982620716095, |
|
"rewards/cosine_scaled_reward": 0.07708277204073966, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 448 |
|
}, |
|
{ |
|
"completion_length": 1263.687515258789, |
|
"epoch": 0.5131428571428571, |
|
"grad_norm": 2.511007308959961, |
|
"kl": 0.743408203125, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.0297, |
|
"reward": 0.17201264947652817, |
|
"reward_std": 0.2096976675093174, |
|
"rewards/cosine_scaled_reward": -0.11413275334052742, |
|
"rewards/format_reward": 0.8750000298023224, |
|
"step": 449 |
|
}, |
|
{ |
|
"completion_length": 1255.812515258789, |
|
"epoch": 0.5142857142857142, |
|
"grad_norm": 3.0915615558624268, |
|
"kl": 1.0966796875, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.044, |
|
"reward": 0.19247347861528397, |
|
"reward_std": 0.2065977193415165, |
|
"rewards/cosine_scaled_reward": -0.023884066613391042, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 1148.1667175292969, |
|
"epoch": 0.5154285714285715, |
|
"grad_norm": 2.033189296722412, |
|
"kl": 0.53155517578125, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.0212, |
|
"reward": 0.25062017887830734, |
|
"reward_std": 0.1323441956192255, |
|
"rewards/cosine_scaled_reward": 0.031208358705043793, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 451 |
|
}, |
|
{ |
|
"completion_length": 1149.8542022705078, |
|
"epoch": 0.5165714285714286, |
|
"grad_norm": 2.3381247520446777, |
|
"kl": 0.7998046875, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": 0.032, |
|
"reward": 0.09467406664043665, |
|
"reward_std": 0.1573140937834978, |
|
"rewards/cosine_scaled_reward": -0.23562941327691078, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 452 |
|
}, |
|
{ |
|
"completion_length": 1543.8542175292969, |
|
"epoch": 0.5177142857142857, |
|
"grad_norm": 3.0497689247131348, |
|
"kl": 1.39990234375, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": 0.0559, |
|
"reward": 0.23793689720332623, |
|
"reward_std": 0.24361500516533852, |
|
"rewards/cosine_scaled_reward": 0.05131397116929293, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 453 |
|
}, |
|
{ |
|
"completion_length": 995.3333740234375, |
|
"epoch": 0.5188571428571429, |
|
"grad_norm": 2.2683708667755127, |
|
"kl": 0.62646484375, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0251, |
|
"reward": 0.16244725324213505, |
|
"reward_std": 0.1817509774118662, |
|
"rewards/cosine_scaled_reward": -0.1395284836180508, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 454 |
|
}, |
|
{ |
|
"completion_length": 1405.8333435058594, |
|
"epoch": 0.52, |
|
"grad_norm": 1.8915313482284546, |
|
"kl": 1.2333984375, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0494, |
|
"reward": 0.15508674364537, |
|
"reward_std": 0.15496904775500298, |
|
"rewards/cosine_scaled_reward": -0.12304865941405296, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 1620.3750305175781, |
|
"epoch": 0.5211428571428571, |
|
"grad_norm": 2.3792831897735596, |
|
"kl": 0.9970703125, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": 0.0399, |
|
"reward": 0.19286850281059742, |
|
"reward_std": 0.1363586913794279, |
|
"rewards/cosine_scaled_reward": -0.02547831228002906, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 456 |
|
}, |
|
{ |
|
"completion_length": 1495.729248046875, |
|
"epoch": 0.5222857142857142, |
|
"grad_norm": 2.826035737991333, |
|
"kl": 1.41796875, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": 0.0568, |
|
"reward": 0.18930382770486176, |
|
"reward_std": 0.19817211106419563, |
|
"rewards/cosine_scaled_reward": -0.011334592942148447, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 457 |
|
}, |
|
{ |
|
"completion_length": 1296.2708740234375, |
|
"epoch": 0.5234285714285715, |
|
"grad_norm": 2.695727586746216, |
|
"kl": 0.623291015625, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": 0.025, |
|
"reward": 0.21301186457276344, |
|
"reward_std": 0.2375541441142559, |
|
"rewards/cosine_scaled_reward": -0.030085243575740606, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 458 |
|
}, |
|
{ |
|
"completion_length": 1180.8958740234375, |
|
"epoch": 0.5245714285714286, |
|
"grad_norm": 1.4999505281448364, |
|
"kl": 0.8074951171875, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.0323, |
|
"reward": 0.19034619256854057, |
|
"reward_std": 0.23577242344617844, |
|
"rewards/cosine_scaled_reward": -0.04188129701651633, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 459 |
|
}, |
|
{ |
|
"completion_length": 1674.2500610351562, |
|
"epoch": 0.5257142857142857, |
|
"grad_norm": 3.6569175720214844, |
|
"kl": 1.400390625, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0561, |
|
"reward": 0.20811645686626434, |
|
"reward_std": 0.24132521450519562, |
|
"rewards/cosine_scaled_reward": -0.020151358097791672, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 1561.0625305175781, |
|
"epoch": 0.5268571428571428, |
|
"grad_norm": 2.9864399433135986, |
|
"kl": 0.868896484375, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.0348, |
|
"reward": 0.23855134937912226, |
|
"reward_std": 0.190349493175745, |
|
"rewards/cosine_scaled_reward": 0.05257879290729761, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 461 |
|
}, |
|
{ |
|
"completion_length": 1403.0417022705078, |
|
"epoch": 0.528, |
|
"grad_norm": 2.8803505897521973, |
|
"kl": 0.5537109375, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": 0.0222, |
|
"reward": 0.1879023276269436, |
|
"reward_std": 0.17408592253923416, |
|
"rewards/cosine_scaled_reward": -0.11774074472486973, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 462 |
|
}, |
|
{ |
|
"completion_length": 1038.3958740234375, |
|
"epoch": 0.5291428571428571, |
|
"grad_norm": 2.510594367980957, |
|
"kl": 0.8834228515625, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": 0.0353, |
|
"reward": 0.2329028071835637, |
|
"reward_std": 0.1337026245892048, |
|
"rewards/cosine_scaled_reward": 0.04953182302415371, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 463 |
|
}, |
|
{ |
|
"completion_length": 1624.3959045410156, |
|
"epoch": 0.5302857142857142, |
|
"grad_norm": 2.223735809326172, |
|
"kl": 0.958251953125, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.0383, |
|
"reward": 0.24998359940946102, |
|
"reward_std": 0.1796153038740158, |
|
"rewards/cosine_scaled_reward": 0.052480582147836685, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 464 |
|
}, |
|
{ |
|
"completion_length": 1534.479248046875, |
|
"epoch": 0.5314285714285715, |
|
"grad_norm": 1.6499871015548706, |
|
"kl": 1.323486328125, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0531, |
|
"reward": 0.1806503850966692, |
|
"reward_std": 0.18742095679044724, |
|
"rewards/cosine_scaled_reward": -0.052027489989995956, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 1744.5209045410156, |
|
"epoch": 0.5325714285714286, |
|
"grad_norm": 1.7625157833099365, |
|
"kl": 0.74755859375, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.0299, |
|
"reward": 0.12266920693218708, |
|
"reward_std": 0.14947674423456192, |
|
"rewards/cosine_scaled_reward": -0.1689941380172968, |
|
"rewards/format_reward": 0.8125000298023224, |
|
"step": 466 |
|
}, |
|
{ |
|
"completion_length": 1490.854232788086, |
|
"epoch": 0.5337142857142857, |
|
"grad_norm": 2.6080636978149414, |
|
"kl": 1.408203125, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": 0.0562, |
|
"reward": 0.1923494804650545, |
|
"reward_std": 0.18007073551416397, |
|
"rewards/cosine_scaled_reward": -0.0493585430085659, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 467 |
|
}, |
|
{ |
|
"completion_length": 1635.5001068115234, |
|
"epoch": 0.5348571428571428, |
|
"grad_norm": 5.10135555267334, |
|
"kl": 1.4853515625, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": 0.0594, |
|
"reward": 0.2753597334958613, |
|
"reward_std": 0.18230173736810684, |
|
"rewards/cosine_scaled_reward": 0.1316775605082512, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 468 |
|
}, |
|
{ |
|
"completion_length": 1196.0208740234375, |
|
"epoch": 0.536, |
|
"grad_norm": 3.6090939044952393, |
|
"kl": 0.96044921875, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": 0.0385, |
|
"reward": 0.15217299573123455, |
|
"reward_std": 0.16991863399744034, |
|
"rewards/cosine_scaled_reward": -0.15571825858205557, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 469 |
|
}, |
|
{ |
|
"completion_length": 1328.2917022705078, |
|
"epoch": 0.5371428571428571, |
|
"grad_norm": 2.7696914672851562, |
|
"kl": 1.2490234375, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.05, |
|
"reward": 0.2623746544122696, |
|
"reward_std": 0.1682613156735897, |
|
"rewards/cosine_scaled_reward": 0.06257599592208862, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 1151.2500305175781, |
|
"epoch": 0.5382857142857143, |
|
"grad_norm": 2.2605948448181152, |
|
"kl": 1.0380859375, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": 0.0415, |
|
"reward": 0.1418502125889063, |
|
"reward_std": 0.1402505859732628, |
|
"rewards/cosine_scaled_reward": -0.16590357944369316, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 471 |
|
}, |
|
{ |
|
"completion_length": 881.2916717529297, |
|
"epoch": 0.5394285714285715, |
|
"grad_norm": 4.6680588722229, |
|
"kl": 0.572265625, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": 0.0229, |
|
"reward": 0.2540171667933464, |
|
"reward_std": 0.21371759288012981, |
|
"rewards/cosine_scaled_reward": 0.0268446896225214, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 472 |
|
}, |
|
{ |
|
"completion_length": 1405.8958740234375, |
|
"epoch": 0.5405714285714286, |
|
"grad_norm": 3.778167963027954, |
|
"kl": 1.0146484375, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": 0.0405, |
|
"reward": 0.08911328506655991, |
|
"reward_std": 0.14409188739955425, |
|
"rewards/cosine_scaled_reward": -0.25133184157311916, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 473 |
|
}, |
|
{ |
|
"completion_length": 1369.7083892822266, |
|
"epoch": 0.5417142857142857, |
|
"grad_norm": 2.8626537322998047, |
|
"kl": 0.7666015625, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": 0.0307, |
|
"reward": 0.16908735921606421, |
|
"reward_std": 0.17285825684666634, |
|
"rewards/cosine_scaled_reward": -0.10159570351243019, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 474 |
|
}, |
|
{ |
|
"completion_length": 1265.7083740234375, |
|
"epoch": 0.5428571428571428, |
|
"grad_norm": 1.8364455699920654, |
|
"kl": 0.827392578125, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0331, |
|
"reward": 0.19515487863100134, |
|
"reward_std": 0.15950417518615723, |
|
"rewards/cosine_scaled_reward": -0.08009354583919048, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 475 |
|
}, |
|
{ |
|
"completion_length": 1177.2917022705078, |
|
"epoch": 0.544, |
|
"grad_norm": 3.9011993408203125, |
|
"kl": 0.51513671875, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.0206, |
|
"reward": 0.2876299601048231, |
|
"reward_std": 0.1502437572926283, |
|
"rewards/cosine_scaled_reward": 0.12480364367365837, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 476 |
|
}, |
|
{ |
|
"completion_length": 1044.687515258789, |
|
"epoch": 0.5451428571428572, |
|
"grad_norm": 1.5102007389068604, |
|
"kl": 0.4571533203125, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": 0.0183, |
|
"reward": 0.2441038154065609, |
|
"reward_std": 0.17297399789094925, |
|
"rewards/cosine_scaled_reward": 0.020557187497615814, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 477 |
|
}, |
|
{ |
|
"completion_length": 1297.6875457763672, |
|
"epoch": 0.5462857142857143, |
|
"grad_norm": 2.5546422004699707, |
|
"kl": 0.71649169921875, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": 0.0286, |
|
"reward": 0.17578930966556072, |
|
"reward_std": 0.11584887467324734, |
|
"rewards/cosine_scaled_reward": -0.1447231061756611, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 478 |
|
}, |
|
{ |
|
"completion_length": 1317.3542022705078, |
|
"epoch": 0.5474285714285714, |
|
"grad_norm": 2.733382225036621, |
|
"kl": 0.9921875, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": 0.0398, |
|
"reward": 0.2549874298274517, |
|
"reward_std": 0.23050136864185333, |
|
"rewards/cosine_scaled_reward": 0.07073704898357391, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 479 |
|
}, |
|
{ |
|
"completion_length": 1605.8333740234375, |
|
"epoch": 0.5485714285714286, |
|
"grad_norm": 2.0302486419677734, |
|
"kl": 0.89404296875, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.0358, |
|
"reward": 0.19943943247199059, |
|
"reward_std": 0.2432672716677189, |
|
"rewards/cosine_scaled_reward": -0.05472554266452789, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 1229.3750305175781, |
|
"epoch": 0.5497142857142857, |
|
"grad_norm": 3.4060001373291016, |
|
"kl": 0.708984375, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": 0.0283, |
|
"reward": 0.13737879018299282, |
|
"reward_std": 0.14800015836954117, |
|
"rewards/cosine_scaled_reward": -0.18615208379924297, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 481 |
|
}, |
|
{ |
|
"completion_length": 1038.5625305175781, |
|
"epoch": 0.5508571428571428, |
|
"grad_norm": 1.422458291053772, |
|
"kl": 0.296142578125, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.0118, |
|
"reward": 0.2588203465566039, |
|
"reward_std": 0.14372991025447845, |
|
"rewards/cosine_scaled_reward": 0.014510583132505417, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 482 |
|
}, |
|
{ |
|
"completion_length": 1331.8542175292969, |
|
"epoch": 0.552, |
|
"grad_norm": 4.249495029449463, |
|
"kl": 1.3828125, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": 0.0552, |
|
"reward": 0.25617819651961327, |
|
"reward_std": 0.18873486295342445, |
|
"rewards/cosine_scaled_reward": 0.04960942268371582, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 483 |
|
}, |
|
{ |
|
"completion_length": 1384.0833740234375, |
|
"epoch": 0.5531428571428572, |
|
"grad_norm": 2.3114612102508545, |
|
"kl": 0.849609375, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": 0.034, |
|
"reward": 0.28373695723712444, |
|
"reward_std": 0.23551687598228455, |
|
"rewards/cosine_scaled_reward": 0.09562918171286583, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 484 |
|
}, |
|
{ |
|
"completion_length": 1479.2292022705078, |
|
"epoch": 0.5542857142857143, |
|
"grad_norm": 2.420201063156128, |
|
"kl": 1.4007568359375, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.056, |
|
"reward": 0.160817326977849, |
|
"reward_std": 0.18855691701173782, |
|
"rewards/cosine_scaled_reward": -0.10896717384457588, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 485 |
|
}, |
|
{ |
|
"completion_length": 1231.2292022705078, |
|
"epoch": 0.5554285714285714, |
|
"grad_norm": 2.2929489612579346, |
|
"kl": 0.67431640625, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.027, |
|
"reward": 0.27992733381688595, |
|
"reward_std": 0.1733872890472412, |
|
"rewards/cosine_scaled_reward": 0.0710078589618206, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 486 |
|
}, |
|
{ |
|
"completion_length": 1449.291748046875, |
|
"epoch": 0.5565714285714286, |
|
"grad_norm": 4.631397724151611, |
|
"kl": 0.955078125, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": 0.0381, |
|
"reward": 0.19613806903362274, |
|
"reward_std": 0.18428167328238487, |
|
"rewards/cosine_scaled_reward": -0.10239327140152454, |
|
"rewards/format_reward": 0.9583333432674408, |
|
"step": 487 |
|
}, |
|
{ |
|
"completion_length": 1643.8750305175781, |
|
"epoch": 0.5577142857142857, |
|
"grad_norm": 2.999095916748047, |
|
"kl": 1.26953125, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.0508, |
|
"reward": 0.14464829117059708, |
|
"reward_std": 0.164125744253397, |
|
"rewards/cosine_scaled_reward": -0.15228741243481636, |
|
"rewards/format_reward": 0.8541666865348816, |
|
"step": 488 |
|
}, |
|
{ |
|
"completion_length": 855.1250228881836, |
|
"epoch": 0.5588571428571428, |
|
"grad_norm": 2.9949464797973633, |
|
"kl": 0.30072021484375, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": 0.012, |
|
"reward": 0.2884952202439308, |
|
"reward_std": 0.1300883013755083, |
|
"rewards/cosine_scaled_reward": 0.063195139169693, |
|
"rewards/format_reward": 0.9791666716337204, |
|
"step": 489 |
|
}, |
|
{ |
|
"completion_length": 1136.1250305175781, |
|
"epoch": 0.56, |
|
"grad_norm": 1.8440228700637817, |
|
"kl": 0.775390625, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.031, |
|
"reward": 0.17430318985134363, |
|
"reward_std": 0.1413884162902832, |
|
"rewards/cosine_scaled_reward": -0.11317074298858643, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 1581.7708587646484, |
|
"epoch": 0.5611428571428572, |
|
"grad_norm": 2.8860220909118652, |
|
"kl": 0.88128662109375, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.0353, |
|
"reward": 0.18477380648255348, |
|
"reward_std": 0.16339224576950073, |
|
"rewards/cosine_scaled_reward": -0.032030028640292585, |
|
"rewards/format_reward": 0.770833358168602, |
|
"step": 491 |
|
}, |
|
{ |
|
"completion_length": 1146.4166717529297, |
|
"epoch": 0.5622857142857143, |
|
"grad_norm": 2.8887076377868652, |
|
"kl": 1.0263671875, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.0411, |
|
"reward": 0.20537487417459488, |
|
"reward_std": 0.17985284700989723, |
|
"rewards/cosine_scaled_reward": -0.05676530674099922, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 492 |
|
}, |
|
{ |
|
"completion_length": 1219.9167175292969, |
|
"epoch": 0.5634285714285714, |
|
"grad_norm": 11.495869636535645, |
|
"kl": 0.35302734375, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": 0.0141, |
|
"reward": 0.17675728350877762, |
|
"reward_std": 0.16118020564317703, |
|
"rewards/cosine_scaled_reward": -0.13073305413126945, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 493 |
|
}, |
|
{ |
|
"completion_length": 1401.2500305175781, |
|
"epoch": 0.5645714285714286, |
|
"grad_norm": 3.791532278060913, |
|
"kl": 1.544677734375, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": 0.0618, |
|
"reward": 0.12077869102358818, |
|
"reward_std": 0.1393336970359087, |
|
"rewards/cosine_scaled_reward": -0.20588408038020134, |
|
"rewards/format_reward": 0.8750000149011612, |
|
"step": 494 |
|
}, |
|
{ |
|
"completion_length": 1228.2708740234375, |
|
"epoch": 0.5657142857142857, |
|
"grad_norm": 2.0392343997955322, |
|
"kl": 0.72314453125, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.0289, |
|
"reward": 0.2310006357729435, |
|
"reward_std": 0.14271893352270126, |
|
"rewards/cosine_scaled_reward": -0.00487779825925827, |
|
"rewards/format_reward": 0.895833358168602, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 1170.8541870117188, |
|
"epoch": 0.5668571428571428, |
|
"grad_norm": 2.2888383865356445, |
|
"kl": 0.8193359375, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.0328, |
|
"reward": 0.33169906958937645, |
|
"reward_std": 0.14619917422533035, |
|
"rewards/cosine_scaled_reward": 0.18171185720711946, |
|
"rewards/format_reward": 0.8958333432674408, |
|
"step": 496 |
|
}, |
|
{ |
|
"completion_length": 1481.7292175292969, |
|
"epoch": 0.568, |
|
"grad_norm": 2.517068386077881, |
|
"kl": 1.252685546875, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": 0.0501, |
|
"reward": 0.2681533806025982, |
|
"reward_std": 0.20135387405753136, |
|
"rewards/cosine_scaled_reward": 0.10805653966963291, |
|
"rewards/format_reward": 0.8125000149011612, |
|
"step": 497 |
|
}, |
|
{ |
|
"completion_length": 1335.375, |
|
"epoch": 0.5691428571428572, |
|
"grad_norm": 2.2976410388946533, |
|
"kl": 0.9261474609375, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": 0.0372, |
|
"reward": 0.156329445540905, |
|
"reward_std": 0.1802426353096962, |
|
"rewards/cosine_scaled_reward": -0.16677019745111465, |
|
"rewards/format_reward": 0.9375000149011612, |
|
"step": 498 |
|
}, |
|
{ |
|
"completion_length": 1487.9583740234375, |
|
"epoch": 0.5702857142857143, |
|
"grad_norm": 1.1875653266906738, |
|
"kl": 0.619964599609375, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": 0.0248, |
|
"reward": 0.12715653842315078, |
|
"reward_std": 0.1571529433131218, |
|
"rewards/cosine_scaled_reward": -0.2139046173542738, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 499 |
|
}, |
|
{ |
|
"completion_length": 1511.5833740234375, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 3.7676918506622314, |
|
"kl": 1.2734375, |
|
"learning_rate": 1e-07, |
|
"loss": 0.051, |
|
"reward": 0.1403972152620554, |
|
"reward_std": 0.23328615352511406, |
|
"rewards/cosine_scaled_reward": -0.10787822678685188, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.014876242799652573, |
|
"train_runtime": 36292.0916, |
|
"train_samples_per_second": 0.661, |
|
"train_steps_per_second": 0.014 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|