OpenRS-GRPO-DPPv1 / trainer_state.json
xiwenc1's picture
Model save
89ed448 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 3001.9584350585938,
"epoch": 0.001142857142857143,
"grad_norm": 0.19055147469043732,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": -0.0,
"reward": -0.0029319413006305695,
"reward_std": 0.12454631552100182,
"rewards/cosine_scaled_reward": -0.1928562317043543,
"rewards/format_reward": 0.37500000558793545,
"step": 1
},
{
"completion_length": 2822.541717529297,
"epoch": 0.002285714285714286,
"grad_norm": 0.28548890352249146,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0,
"reward": 0.11451426180428825,
"reward_std": 0.2134026400744915,
"rewards/cosine_scaled_reward": -0.009885392151772976,
"rewards/format_reward": 0.4583333432674408,
"step": 2
},
{
"completion_length": 2819.25,
"epoch": 0.0034285714285714284,
"grad_norm": 0.1939578354358673,
"kl": 3.499537706375122e-05,
"learning_rate": 6e-08,
"loss": 0.0,
"reward": -0.07006961421575397,
"reward_std": 0.11589069850742817,
"rewards/cosine_scaled_reward": -0.29296013712882996,
"rewards/format_reward": 0.31250000186264515,
"step": 3
},
{
"completion_length": 2995.2501220703125,
"epoch": 0.004571428571428572,
"grad_norm": 0.21970215439796448,
"kl": 4.45246696472168e-05,
"learning_rate": 8e-08,
"loss": 0.0,
"reward": -0.03315656236372888,
"reward_std": 0.14162674359977245,
"rewards/cosine_scaled_reward": -0.20897372206673026,
"rewards/format_reward": 0.29166668094694614,
"step": 4
},
{
"completion_length": 2716.666748046875,
"epoch": 0.005714285714285714,
"grad_norm": 0.17376844584941864,
"kl": 2.6345252990722656e-05,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": 0.1103692501783371,
"reward_std": 0.14548173919320107,
"rewards/cosine_scaled_reward": 0.002536635845899582,
"rewards/format_reward": 0.41666667722165585,
"step": 5
},
{
"completion_length": 2795.229248046875,
"epoch": 0.006857142857142857,
"grad_norm": 0.20082038640975952,
"kl": 3.698468208312988e-05,
"learning_rate": 1.2e-07,
"loss": 0.0,
"reward": 0.07470211386680603,
"reward_std": 0.1657848320901394,
"rewards/cosine_scaled_reward": -0.08863592706620693,
"rewards/format_reward": 0.4583333432674408,
"step": 6
},
{
"completion_length": 2590.2084350585938,
"epoch": 0.008,
"grad_norm": 0.21514829993247986,
"kl": 2.5466084480285645e-05,
"learning_rate": 1.4e-07,
"loss": 0.0,
"reward": 0.1516738818027079,
"reward_std": 0.24778864905238152,
"rewards/cosine_scaled_reward": 0.03323611244559288,
"rewards/format_reward": 0.520833358168602,
"step": 7
},
{
"completion_length": 2860.7708740234375,
"epoch": 0.009142857142857144,
"grad_norm": 0.18623042106628418,
"kl": 3.412365913391113e-05,
"learning_rate": 1.6e-07,
"loss": 0.0,
"reward": 0.005636372021399438,
"reward_std": 0.18859010189771652,
"rewards/cosine_scaled_reward": -0.21989555237814784,
"rewards/format_reward": 0.4583333432674408,
"step": 8
},
{
"completion_length": 3180.3541870117188,
"epoch": 0.010285714285714285,
"grad_norm": 0.18793436884880066,
"kl": 3.8176774978637695e-05,
"learning_rate": 1.8e-07,
"loss": 0.0,
"reward": -0.005034097470343113,
"reward_std": 0.17726320587098598,
"rewards/cosine_scaled_reward": -0.13775646989233792,
"rewards/format_reward": 0.2500000149011612,
"step": 9
},
{
"completion_length": 2277.3750915527344,
"epoch": 0.011428571428571429,
"grad_norm": 0.24739593267440796,
"kl": 3.692507743835449e-05,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": 0.0970942941494286,
"reward_std": 0.1597892940044403,
"rewards/cosine_scaled_reward": -0.11606822581961751,
"rewards/format_reward": 0.604166679084301,
"step": 10
},
{
"completion_length": 2501.1250610351562,
"epoch": 0.012571428571428572,
"grad_norm": 0.29089272022247314,
"kl": 3.090500831604004e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"reward": 0.14959498681128025,
"reward_std": 0.26211751252412796,
"rewards/cosine_scaled_reward": -0.0010929219424724579,
"rewards/format_reward": 0.5833333656191826,
"step": 11
},
{
"completion_length": 2632.9375,
"epoch": 0.013714285714285714,
"grad_norm": 0.23229414224624634,
"kl": 3.573298454284668e-05,
"learning_rate": 2.4e-07,
"loss": 0.0,
"reward": 0.1019544918090105,
"reward_std": 0.23751188814640045,
"rewards/cosine_scaled_reward": -0.04352016560733318,
"rewards/format_reward": 0.4791666716337204,
"step": 12
},
{
"completion_length": 2213.3125610351562,
"epoch": 0.014857142857142857,
"grad_norm": 0.1961122304201126,
"kl": 2.2172927856445312e-05,
"learning_rate": 2.6e-07,
"loss": 0.0,
"reward": 0.17769552022218704,
"reward_std": 0.1783296838402748,
"rewards/cosine_scaled_reward": 0.042532917112112045,
"rewards/format_reward": 0.6041666716337204,
"step": 13
},
{
"completion_length": 2936.5625610351562,
"epoch": 0.016,
"grad_norm": 0.19352266192436218,
"kl": 3.9771199226379395e-05,
"learning_rate": 2.8e-07,
"loss": 0.0,
"reward": 0.0035927146673202515,
"reward_std": 0.15151787921786308,
"rewards/cosine_scaled_reward": -0.17052022088319063,
"rewards/format_reward": 0.35416667722165585,
"step": 14
},
{
"completion_length": 3146.5416870117188,
"epoch": 0.017142857142857144,
"grad_norm": 0.195539191365242,
"kl": 3.9458274841308594e-05,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.045046235201880336,
"reward_std": 0.2458735667169094,
"rewards/cosine_scaled_reward": -0.03827371634542942,
"rewards/format_reward": 0.2500000074505806,
"step": 15
},
{
"completion_length": 2227.1875610351562,
"epoch": 0.018285714285714287,
"grad_norm": 0.2652665972709656,
"kl": 1.9550323486328125e-05,
"learning_rate": 3.2e-07,
"loss": 0.0,
"reward": 0.21294382214546204,
"reward_std": 0.22787468880414963,
"rewards/cosine_scaled_reward": 0.13755386415868998,
"rewards/format_reward": 0.5416666865348816,
"step": 16
},
{
"completion_length": 3370.7708740234375,
"epoch": 0.019428571428571427,
"grad_norm": 0.21129053831100464,
"kl": 4.503130912780762e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"reward": -0.024287598207592964,
"reward_std": 0.14709143340587616,
"rewards/cosine_scaled_reward": -0.15107670798897743,
"rewards/format_reward": 0.2083333432674408,
"step": 17
},
{
"completion_length": 2967.5625610351562,
"epoch": 0.02057142857142857,
"grad_norm": 0.2422505021095276,
"kl": 4.3272972106933594e-05,
"learning_rate": 3.6e-07,
"loss": 0.0,
"reward": -0.025027030613273382,
"reward_std": 0.17510299384593964,
"rewards/cosine_scaled_reward": -0.17359089059755206,
"rewards/format_reward": 0.25000000558793545,
"step": 18
},
{
"completion_length": 3309.6041870117188,
"epoch": 0.021714285714285714,
"grad_norm": 0.16750593483448029,
"kl": 3.74913215637207e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"reward": 0.054348187521100044,
"reward_std": 0.26255496218800545,
"rewards/cosine_scaled_reward": -0.041149500757455826,
"rewards/format_reward": 0.291666679084301,
"step": 19
},
{
"completion_length": 2639.2083740234375,
"epoch": 0.022857142857142857,
"grad_norm": 0.2358766347169876,
"kl": 3.248453140258789e-05,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.09739121049642563,
"reward_std": 0.15525865368545055,
"rewards/cosine_scaled_reward": -0.07560409791767597,
"rewards/format_reward": 0.520833358168602,
"step": 20
},
{
"completion_length": 2620.1875610351562,
"epoch": 0.024,
"grad_norm": 0.23142506182193756,
"kl": 3.2573938369750977e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"reward": 0.06687073037028313,
"reward_std": 0.10379723459482193,
"rewards/cosine_scaled_reward": -0.06918285926803946,
"rewards/format_reward": 0.39583333395421505,
"step": 21
},
{
"completion_length": 3418.6250610351562,
"epoch": 0.025142857142857144,
"grad_norm": 0.23349517583847046,
"kl": 3.2335519790649414e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"reward": -0.09891281835734844,
"reward_std": 0.16107076033949852,
"rewards/cosine_scaled_reward": -0.24483727663755417,
"rewards/format_reward": 0.10416666977107525,
"step": 22
},
{
"completion_length": 3000.4168090820312,
"epoch": 0.026285714285714287,
"grad_norm": 0.22397248446941376,
"kl": 4.8041343688964844e-05,
"learning_rate": 4.6e-07,
"loss": 0.0,
"reward": 0.06495004217140377,
"reward_std": 0.19814502075314522,
"rewards/cosine_scaled_reward": -0.09647991880774498,
"rewards/format_reward": 0.43750001303851604,
"step": 23
},
{
"completion_length": 2132.4583740234375,
"epoch": 0.027428571428571427,
"grad_norm": 0.2273954451084137,
"kl": 1.659989356994629e-05,
"learning_rate": 4.8e-07,
"loss": 0.0,
"reward": 0.19651076383888721,
"reward_std": 0.20518572628498077,
"rewards/cosine_scaled_reward": 0.04420430213212967,
"rewards/format_reward": 0.666666679084301,
"step": 24
},
{
"completion_length": 2864.8125610351562,
"epoch": 0.02857142857142857,
"grad_norm": 0.2530263364315033,
"kl": 2.9861927032470703e-05,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 0.09658885933458805,
"reward_std": 0.15381062403321266,
"rewards/cosine_scaled_reward": -0.010462287813425064,
"rewards/format_reward": 0.3958333395421505,
"step": 25
},
{
"completion_length": 3151.791748046875,
"epoch": 0.029714285714285714,
"grad_norm": 0.15439528226852417,
"kl": 2.0995736122131348e-05,
"learning_rate": 5.2e-07,
"loss": 0.0,
"reward": 0.020544751780107617,
"reward_std": 0.16202639788389206,
"rewards/cosine_scaled_reward": -0.07661792263388634,
"rewards/format_reward": 0.229166679084301,
"step": 26
},
{
"completion_length": 2976.416748046875,
"epoch": 0.030857142857142857,
"grad_norm": 0.20570415258407593,
"kl": 2.8975307941436768e-05,
"learning_rate": 5.4e-07,
"loss": 0.0,
"reward": 0.01607461948879063,
"reward_std": 0.12287303619086742,
"rewards/cosine_scaled_reward": -0.10305411368608475,
"rewards/format_reward": 0.27083333395421505,
"step": 27
},
{
"completion_length": 3119.3125610351562,
"epoch": 0.032,
"grad_norm": 0.21759092807769775,
"kl": 3.6269426345825195e-05,
"learning_rate": 5.6e-07,
"loss": 0.0,
"reward": 0.045295797288417816,
"reward_std": 0.12472150847315788,
"rewards/cosine_scaled_reward": -0.06871754361782223,
"rewards/format_reward": 0.31250000558793545,
"step": 28
},
{
"completion_length": 3055.291748046875,
"epoch": 0.03314285714285714,
"grad_norm": 0.17530092597007751,
"kl": 1.576542854309082e-05,
"learning_rate": 5.8e-07,
"loss": 0.0,
"reward": 0.03692814148962498,
"reward_std": 0.2673305310308933,
"rewards/cosine_scaled_reward": -0.08717780001461506,
"rewards/format_reward": 0.31250001676380634,
"step": 29
},
{
"completion_length": 3088.9375610351562,
"epoch": 0.03428571428571429,
"grad_norm": 0.23589830100536346,
"kl": 2.016127109527588e-05,
"learning_rate": 6e-07,
"loss": 0.0,
"reward": 0.039040276780724525,
"reward_std": 0.18401793204247952,
"rewards/cosine_scaled_reward": -0.11255598999559879,
"rewards/format_reward": 0.37500001303851604,
"step": 30
},
{
"completion_length": 2690.041748046875,
"epoch": 0.03542857142857143,
"grad_norm": 0.25512003898620605,
"kl": 2.226606011390686e-05,
"learning_rate": 6.2e-07,
"loss": 0.0,
"reward": 0.02995466347783804,
"reward_std": 0.1407727226614952,
"rewards/cosine_scaled_reward": -0.1338973045349121,
"rewards/format_reward": 0.37500001303851604,
"step": 31
},
{
"completion_length": 3518.2916870117188,
"epoch": 0.036571428571428574,
"grad_norm": 0.15214499831199646,
"kl": 1.8969178199768066e-05,
"learning_rate": 6.4e-07,
"loss": 0.0,
"reward": -0.052682604640722275,
"reward_std": 0.19048137590289116,
"rewards/cosine_scaled_reward": -0.1558831539005041,
"rewards/format_reward": 0.10416666977107525,
"step": 32
},
{
"completion_length": 2984.3541870117188,
"epoch": 0.037714285714285714,
"grad_norm": 0.19921286404132843,
"kl": 2.2009015083312988e-05,
"learning_rate": 6.6e-07,
"loss": 0.0,
"reward": 0.06904905498959124,
"reward_std": 0.18269503861665726,
"rewards/cosine_scaled_reward": -0.08260507695376873,
"rewards/format_reward": 0.43750002048909664,
"step": 33
},
{
"completion_length": 3051.0208435058594,
"epoch": 0.038857142857142854,
"grad_norm": 0.19640006124973297,
"kl": 2.3230910301208496e-05,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0,
"reward": -0.025606604642234743,
"reward_std": 0.23111771792173386,
"rewards/cosine_scaled_reward": -0.17444449942559004,
"rewards/format_reward": 0.25000000558793545,
"step": 34
},
{
"completion_length": 2780.9376220703125,
"epoch": 0.04,
"grad_norm": 0.18088482320308685,
"kl": 2.244114875793457e-05,
"learning_rate": 7e-07,
"loss": 0.0,
"reward": 0.10355074889957905,
"reward_std": 0.20479629933834076,
"rewards/cosine_scaled_reward": -0.0417685154825449,
"rewards/format_reward": 0.47916668467223644,
"step": 35
},
{
"completion_length": 2462.7083740234375,
"epoch": 0.04114285714285714,
"grad_norm": 0.18930543959140778,
"kl": 5.8710575103759766e-05,
"learning_rate": 7.2e-07,
"loss": 0.0,
"reward": 0.1567097045481205,
"reward_std": 0.12252922169864178,
"rewards/cosine_scaled_reward": 0.0724838562309742,
"rewards/format_reward": 0.4583333432674408,
"step": 36
},
{
"completion_length": 2707.2500915527344,
"epoch": 0.04228571428571429,
"grad_norm": 0.20084795355796814,
"kl": 4.0590763092041016e-05,
"learning_rate": 7.4e-07,
"loss": 0.0,
"reward": 0.13954784779343754,
"reward_std": 0.1899961344897747,
"rewards/cosine_scaled_reward": 0.006025645881891251,
"rewards/format_reward": 0.5208333488553762,
"step": 37
},
{
"completion_length": 2946.6250610351562,
"epoch": 0.04342857142857143,
"grad_norm": 0.2238946110010147,
"kl": 5.5283308029174805e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0,
"reward": -0.01690117083489895,
"reward_std": 0.18682749196887016,
"rewards/cosine_scaled_reward": -0.20232924073934555,
"rewards/format_reward": 0.3333333469927311,
"step": 38
},
{
"completion_length": 2901.9584350585938,
"epoch": 0.044571428571428574,
"grad_norm": 0.22113607823848724,
"kl": 5.8710575103759766e-05,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0,
"reward": 0.09138605836778879,
"reward_std": 0.25620007514953613,
"rewards/cosine_scaled_reward": -0.05268890131264925,
"rewards/format_reward": 0.4583333544433117,
"step": 39
},
{
"completion_length": 3042.3334350585938,
"epoch": 0.045714285714285714,
"grad_norm": 0.17167918384075165,
"kl": 3.018975257873535e-05,
"learning_rate": 8e-07,
"loss": 0.0,
"reward": 0.025605826638638973,
"reward_std": 0.17767371982336044,
"rewards/cosine_scaled_reward": -0.11477911379188299,
"rewards/format_reward": 0.3333333395421505,
"step": 40
},
{
"completion_length": 3265.6875610351562,
"epoch": 0.046857142857142854,
"grad_norm": 0.15757694840431213,
"kl": 1.1652708053588867e-05,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0,
"reward": 0.032783683855086565,
"reward_std": 0.13638111762702465,
"rewards/cosine_scaled_reward": -0.07333838939666748,
"rewards/format_reward": 0.2708333358168602,
"step": 41
},
{
"completion_length": 2019.4584350585938,
"epoch": 0.048,
"grad_norm": 0.2902780771255493,
"kl": 0.00019973516464233398,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0,
"reward": 0.2274437490850687,
"reward_std": 0.20474949106574059,
"rewards/cosine_scaled_reward": 0.08997760340571404,
"rewards/format_reward": 0.6875000149011612,
"step": 42
},
{
"completion_length": 3191.1250610351562,
"epoch": 0.04914285714285714,
"grad_norm": 0.199421688914299,
"kl": 3.5726698115468025e-05,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0,
"reward": 0.00293779862113297,
"reward_std": 0.2170444093644619,
"rewards/cosine_scaled_reward": -0.1411289218813181,
"rewards/format_reward": 0.291666679084301,
"step": 43
},
{
"completion_length": 2911.6875610351562,
"epoch": 0.05028571428571429,
"grad_norm": 0.20561084151268005,
"kl": 0.0003883242607116699,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"reward": 0.06697382591664791,
"reward_std": 0.14113801531493664,
"rewards/cosine_scaled_reward": -0.07008513808250427,
"rewards/format_reward": 0.3958333432674408,
"step": 44
},
{
"completion_length": 2605.666717529297,
"epoch": 0.05142857142857143,
"grad_norm": 0.18309232592582703,
"kl": 4.357099533081055e-05,
"learning_rate": 9e-07,
"loss": 0.0,
"reward": 0.2496887871529907,
"reward_std": 0.23000844940543175,
"rewards/cosine_scaled_reward": 0.19401046447455883,
"rewards/format_reward": 0.5833333507180214,
"step": 45
},
{
"completion_length": 2926.604217529297,
"epoch": 0.052571428571428575,
"grad_norm": 0.1925714910030365,
"kl": 0.00014609098434448242,
"learning_rate": 9.2e-07,
"loss": 0.0,
"reward": 0.028744973242282867,
"reward_std": 0.1495701353996992,
"rewards/cosine_scaled_reward": -0.10348123731091619,
"rewards/format_reward": 0.3125,
"step": 46
},
{
"completion_length": 2612.3125915527344,
"epoch": 0.053714285714285714,
"grad_norm": 0.19961993396282196,
"kl": 9.500980377197266e-05,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0,
"reward": 0.08338814397575334,
"reward_std": 0.19680871814489365,
"rewards/cosine_scaled_reward": -0.08348039817065,
"rewards/format_reward": 0.4791666716337204,
"step": 47
},
{
"completion_length": 2722.3959350585938,
"epoch": 0.054857142857142854,
"grad_norm": 0.21567687392234802,
"kl": 9.578466415405273e-05,
"learning_rate": 9.6e-07,
"loss": 0.0,
"reward": 0.09677822515368462,
"reward_std": 0.2861610949039459,
"rewards/cosine_scaled_reward": -0.04573226906359196,
"rewards/format_reward": 0.45833334885537624,
"step": 48
},
{
"completion_length": 2226.354217529297,
"epoch": 0.056,
"grad_norm": 0.38961726427078247,
"kl": 0.000255584716796875,
"learning_rate": 9.8e-07,
"loss": 0.0,
"reward": 0.15269484370946884,
"reward_std": 0.18540722876787186,
"rewards/cosine_scaled_reward": -0.019495231565088034,
"rewards/format_reward": 0.625,
"step": 49
},
{
"completion_length": 2191.729217529297,
"epoch": 0.05714285714285714,
"grad_norm": 0.25581982731819153,
"kl": 0.000471651554107666,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.1666366644203663,
"reward_std": 0.16899916529655457,
"rewards/cosine_scaled_reward": 0.057675519958138466,
"rewards/format_reward": 0.520833358168602,
"step": 50
},
{
"completion_length": 3260.0416870117188,
"epoch": 0.05828571428571429,
"grad_norm": 0.2235163003206253,
"kl": 0.00012612342834472656,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0,
"reward": -0.0477819973602891,
"reward_std": 0.16141689382493496,
"rewards/cosine_scaled_reward": -0.20123709551990032,
"rewards/format_reward": 0.2083333395421505,
"step": 51
},
{
"completion_length": 3330.979248046875,
"epoch": 0.05942857142857143,
"grad_norm": 0.176573246717453,
"kl": 0.00023108720779418945,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0,
"reward": -0.0365308066830039,
"reward_std": 0.14085101708769798,
"rewards/cosine_scaled_reward": -0.1649935580790043,
"rewards/format_reward": 0.18750000558793545,
"step": 52
},
{
"completion_length": 3004.7083740234375,
"epoch": 0.060571428571428575,
"grad_norm": 0.20955270528793335,
"kl": 0.00027896463871002197,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0,
"reward": -0.003909507766366005,
"reward_std": 0.11847533471882343,
"rewards/cosine_scaled_reward": -0.14490897953510284,
"rewards/format_reward": 0.2708333395421505,
"step": 53
},
{
"completion_length": 2624.6875610351562,
"epoch": 0.061714285714285715,
"grad_norm": 0.21326404809951782,
"kl": 0.00024962425231933594,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0,
"reward": -0.006349522154778242,
"reward_std": 0.10255092941224575,
"rewards/cosine_scaled_reward": -0.21963607892394066,
"rewards/format_reward": 0.4166666865348816,
"step": 54
},
{
"completion_length": 3092.479248046875,
"epoch": 0.06285714285714286,
"grad_norm": 0.16402700543403625,
"kl": 0.00016164779663085938,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0,
"reward": -0.02835557982325554,
"reward_std": 0.16623086854815483,
"rewards/cosine_scaled_reward": -0.1794952228665352,
"rewards/format_reward": 0.25000000558793545,
"step": 55
},
{
"completion_length": 3083.1458740234375,
"epoch": 0.064,
"grad_norm": 0.22414982318878174,
"kl": 0.0007500648498535156,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0,
"reward": 0.025544505566358566,
"reward_std": 0.20646458864212036,
"rewards/cosine_scaled_reward": -0.08164806384593248,
"rewards/format_reward": 0.27083333395421505,
"step": 56
},
{
"completion_length": 3015.5000610351562,
"epoch": 0.06514285714285714,
"grad_norm": 0.26501360535621643,
"kl": 0.0005974769592285156,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0,
"reward": 0.0491860609035939,
"reward_std": 0.1841455027461052,
"rewards/cosine_scaled_reward": -0.09901990741491318,
"rewards/format_reward": 0.39583333395421505,
"step": 57
},
{
"completion_length": 3105.1666870117188,
"epoch": 0.06628571428571428,
"grad_norm": 0.1655452996492386,
"kl": 0.0008473992347717285,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0,
"reward": -0.05875010509043932,
"reward_std": 0.08936690539121628,
"rewards/cosine_scaled_reward": -0.23779355734586716,
"rewards/format_reward": 0.25,
"step": 58
},
{
"completion_length": 2726.291748046875,
"epoch": 0.06742857142857143,
"grad_norm": 0.2481091469526291,
"kl": 0.0003948211669921875,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0,
"reward": 0.1814795120153576,
"reward_std": 0.23579821549355984,
"rewards/cosine_scaled_reward": 0.11390005052089691,
"rewards/format_reward": 0.4791666679084301,
"step": 59
},
{
"completion_length": 2841.291748046875,
"epoch": 0.06857142857142857,
"grad_norm": 0.196300208568573,
"kl": 0.0006402730941772461,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"reward": 0.09838544577360153,
"reward_std": 0.1272505521774292,
"rewards/cosine_scaled_reward": 0.030779220163822174,
"rewards/format_reward": 0.31250000186264515,
"step": 60
},
{
"completion_length": 2196.8333740234375,
"epoch": 0.06971428571428571,
"grad_norm": 0.1789156198501587,
"kl": 0.0003542900085449219,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0,
"reward": 0.2810430023819208,
"reward_std": 0.11581777688115835,
"rewards/cosine_scaled_reward": 0.23955225199460983,
"rewards/format_reward": 0.6041666716337204,
"step": 61
},
{
"completion_length": 3327.354248046875,
"epoch": 0.07085714285714285,
"grad_norm": 0.15294106304645538,
"kl": 0.00029337406158447266,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0,
"reward": 0.07980241999030113,
"reward_std": 0.2444281317293644,
"rewards/cosine_scaled_reward": -0.01411302387714386,
"rewards/format_reward": 0.33333334140479565,
"step": 62
},
{
"completion_length": 2406.1875610351562,
"epoch": 0.072,
"grad_norm": 0.18347583711147308,
"kl": 0.0021266937255859375,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0001,
"reward": 0.2145740818232298,
"reward_std": 0.17063674330711365,
"rewards/cosine_scaled_reward": 0.11840518936514854,
"rewards/format_reward": 0.583333358168602,
"step": 63
},
{
"completion_length": 2985.166748046875,
"epoch": 0.07314285714285715,
"grad_norm": 0.16115514934062958,
"kl": 0.00035321712493896484,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0,
"reward": 0.11250189319252968,
"reward_std": 0.17308304831385612,
"rewards/cosine_scaled_reward": 0.03135997918434441,
"rewards/format_reward": 0.3750000149011612,
"step": 64
},
{
"completion_length": 2662.2083740234375,
"epoch": 0.07428571428571429,
"grad_norm": 0.19161196053028107,
"kl": 0.000934600830078125,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0,
"reward": 0.18816682742908597,
"reward_std": 0.19341924414038658,
"rewards/cosine_scaled_reward": 0.11808005906641483,
"rewards/format_reward": 0.5000000149011612,
"step": 65
},
{
"completion_length": 3121.8125610351562,
"epoch": 0.07542857142857143,
"grad_norm": 0.15908025205135345,
"kl": 0.0005314350128173828,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0,
"reward": 0.07345604291185737,
"reward_std": 0.16167625226080418,
"rewards/cosine_scaled_reward": -0.04804755933582783,
"rewards/format_reward": 0.37500000186264515,
"step": 66
},
{
"completion_length": 2960.312530517578,
"epoch": 0.07657142857142857,
"grad_norm": 0.19106991589069366,
"kl": 0.0009171962738037109,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0,
"reward": 0.11973061971366405,
"reward_std": 0.23938697017729282,
"rewards/cosine_scaled_reward": 0.04380590561777353,
"rewards/format_reward": 0.37500001676380634,
"step": 67
},
{
"completion_length": 2512.5625610351562,
"epoch": 0.07771428571428571,
"grad_norm": 0.27955880761146545,
"kl": 0.0024480819702148438,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0001,
"reward": 0.032972510904073715,
"reward_std": 0.16721198707818985,
"rewards/cosine_scaled_reward": -0.14768926287069917,
"rewards/format_reward": 0.41666667722165585,
"step": 68
},
{
"completion_length": 2698.6250915527344,
"epoch": 0.07885714285714286,
"grad_norm": 0.1807398796081543,
"kl": 0.0006427764892578125,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0,
"reward": 0.14363489238894545,
"reward_std": 0.17969760112464428,
"rewards/cosine_scaled_reward": 0.050120849162340164,
"rewards/format_reward": 0.4583333358168602,
"step": 69
},
{
"completion_length": 3165.1458740234375,
"epoch": 0.08,
"grad_norm": 0.17745569348335266,
"kl": 0.0035309791564941406,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0001,
"reward": 0.015050832647830248,
"reward_std": 0.14714835956692696,
"rewards/cosine_scaled_reward": -0.1469174176454544,
"rewards/format_reward": 0.3541666716337204,
"step": 70
},
{
"completion_length": 2699.7709045410156,
"epoch": 0.08114285714285714,
"grad_norm": 0.22869746387004852,
"kl": 0.00305938720703125,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0001,
"reward": 0.04336157673969865,
"reward_std": 0.17576204612851143,
"rewards/cosine_scaled_reward": -0.11572509631514549,
"rewards/format_reward": 0.3958333469927311,
"step": 71
},
{
"completion_length": 2606.291748046875,
"epoch": 0.08228571428571428,
"grad_norm": 0.2361251413822174,
"kl": 0.0014438629150390625,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0001,
"reward": 0.2736402824521065,
"reward_std": 0.2642326597124338,
"rewards/cosine_scaled_reward": 0.2026251358911395,
"rewards/format_reward": 0.6458333432674408,
"step": 72
},
{
"completion_length": 2170.729248046875,
"epoch": 0.08342857142857144,
"grad_norm": 0.25033605098724365,
"kl": 0.0022487640380859375,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0001,
"reward": 0.265652135014534,
"reward_std": 0.19880715385079384,
"rewards/cosine_scaled_reward": 0.17191709205508232,
"rewards/format_reward": 0.6875000149011612,
"step": 73
},
{
"completion_length": 2448.6251220703125,
"epoch": 0.08457142857142858,
"grad_norm": 0.2048778533935547,
"kl": 0.0019674301147460938,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0001,
"reward": 0.08278486505150795,
"reward_std": 0.1648626308888197,
"rewards/cosine_scaled_reward": -0.10248487256467342,
"rewards/format_reward": 0.5208333432674408,
"step": 74
},
{
"completion_length": 2838.2500610351562,
"epoch": 0.08571428571428572,
"grad_norm": 0.2654828727245331,
"kl": 0.0029697418212890625,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0001,
"reward": 0.06769379088655114,
"reward_std": 0.24208112806081772,
"rewards/cosine_scaled_reward": -0.04729684395715594,
"rewards/format_reward": 0.3541666679084301,
"step": 75
},
{
"completion_length": 2911.5209350585938,
"epoch": 0.08685714285714285,
"grad_norm": 0.17353959381580353,
"kl": 0.0006062984466552734,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0,
"reward": 0.060059760231524706,
"reward_std": 0.19824261032044888,
"rewards/cosine_scaled_reward": -0.07150101102888584,
"rewards/format_reward": 0.37500000558793545,
"step": 76
},
{
"completion_length": 3174.6875610351562,
"epoch": 0.088,
"grad_norm": 0.15684625506401062,
"kl": 0.00110626220703125,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0,
"reward": 0.03418249450623989,
"reward_std": 0.17063240334391594,
"rewards/cosine_scaled_reward": -0.05116889998316765,
"rewards/format_reward": 0.22916666977107525,
"step": 77
},
{
"completion_length": 2902.9375,
"epoch": 0.08914285714285715,
"grad_norm": 0.209213986992836,
"kl": 0.0010857582092285156,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0,
"reward": 0.10875159315764904,
"reward_std": 0.13933855667710304,
"rewards/cosine_scaled_reward": -0.0003913678228855133,
"rewards/format_reward": 0.4166666828095913,
"step": 78
},
{
"completion_length": 2852.0416870117188,
"epoch": 0.09028571428571429,
"grad_norm": 0.19134745001792908,
"kl": 0.0005846023559570312,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0,
"reward": 0.12408905290067196,
"reward_std": 0.227988138794899,
"rewards/cosine_scaled_reward": 0.002291955053806305,
"rewards/format_reward": 0.479166679084301,
"step": 79
},
{
"completion_length": 3436.8125,
"epoch": 0.09142857142857143,
"grad_norm": 0.15251587331295013,
"kl": 0.0006704330444335938,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0,
"reward": -0.02238150453194976,
"reward_std": 0.1688902247697115,
"rewards/cosine_scaled_reward": -0.12677161488682032,
"rewards/format_reward": 0.1666666716337204,
"step": 80
},
{
"completion_length": 3266.9375610351562,
"epoch": 0.09257142857142857,
"grad_norm": 0.17284975945949554,
"kl": 0.0026865005493164062,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0001,
"reward": 0.0007507335394620895,
"reward_std": 0.1353786587715149,
"rewards/cosine_scaled_reward": -0.09297612681984901,
"rewards/format_reward": 0.18750000186264515,
"step": 81
},
{
"completion_length": 3086.416748046875,
"epoch": 0.09371428571428571,
"grad_norm": 0.18978342413902283,
"kl": 0.0010721683502197266,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0,
"reward": 0.06928094290196896,
"reward_std": 0.18230855278670788,
"rewards/cosine_scaled_reward": -0.0865684850141406,
"rewards/format_reward": 0.4375000111758709,
"step": 82
},
{
"completion_length": 3473.8958740234375,
"epoch": 0.09485714285714286,
"grad_norm": 0.17331495881080627,
"kl": 0.0004067420959472656,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0,
"reward": -0.11179337278008461,
"reward_std": 0.1241249330341816,
"rewards/cosine_scaled_reward": -0.2696155607700348,
"rewards/format_reward": 0.1041666679084301,
"step": 83
},
{
"completion_length": 2710.541748046875,
"epoch": 0.096,
"grad_norm": 0.18735957145690918,
"kl": 0.00074005126953125,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0,
"reward": 0.2588787730783224,
"reward_std": 0.20492861978709698,
"rewards/cosine_scaled_reward": 0.19406858971342444,
"rewards/format_reward": 0.604166679084301,
"step": 84
},
{
"completion_length": 3282.8333740234375,
"epoch": 0.09714285714285714,
"grad_norm": 0.1622576266527176,
"kl": 0.00115203857421875,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0,
"reward": 0.07870295969769359,
"reward_std": 0.12834673561155796,
"rewards/cosine_scaled_reward": -0.0030822306871414185,
"rewards/format_reward": 0.3125000074505806,
"step": 85
},
{
"completion_length": 2859.2083740234375,
"epoch": 0.09828571428571428,
"grad_norm": 0.20539958775043488,
"kl": 0.0013775825500488281,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0001,
"reward": 0.020047522732056677,
"reward_std": 0.17210980132222176,
"rewards/cosine_scaled_reward": -0.14074895903468132,
"rewards/format_reward": 0.3541666679084301,
"step": 86
},
{
"completion_length": 3006.3125610351562,
"epoch": 0.09942857142857142,
"grad_norm": 0.19360637664794922,
"kl": 0.0025768280029296875,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0001,
"reward": -0.005380205810070038,
"reward_std": 0.16785390488803387,
"rewards/cosine_scaled_reward": -0.1755614336580038,
"rewards/format_reward": 0.3333333395421505,
"step": 87
},
{
"completion_length": 2985.729248046875,
"epoch": 0.10057142857142858,
"grad_norm": 0.17934949696063995,
"kl": 0.0023851394653320312,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0001,
"reward": 0.13574134244117886,
"reward_std": 0.21831882745027542,
"rewards/cosine_scaled_reward": 0.0448464211076498,
"rewards/format_reward": 0.4375000074505806,
"step": 88
},
{
"completion_length": 3279.5208740234375,
"epoch": 0.10171428571428572,
"grad_norm": 0.17610202729701996,
"kl": 0.005316257476806641,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0002,
"reward": 0.08696338161826134,
"reward_std": 0.19547893106937408,
"rewards/cosine_scaled_reward": 0.030678212642669678,
"rewards/format_reward": 0.27083334140479565,
"step": 89
},
{
"completion_length": 3082.6875,
"epoch": 0.10285714285714286,
"grad_norm": 0.1900225430727005,
"kl": 0.001110076904296875,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0,
"reward": 0.04298854619264603,
"reward_std": 0.20502052083611488,
"rewards/cosine_scaled_reward": -0.06164951249957085,
"rewards/format_reward": 0.2916666828095913,
"step": 90
},
{
"completion_length": 3002.5833740234375,
"epoch": 0.104,
"grad_norm": 0.18745382130146027,
"kl": 0.0011818408966064453,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0,
"reward": 0.0811410085298121,
"reward_std": 0.1805788390338421,
"rewards/cosine_scaled_reward": -0.05142554081976414,
"rewards/format_reward": 0.41666667349636555,
"step": 91
},
{
"completion_length": 2689.104248046875,
"epoch": 0.10514285714285715,
"grad_norm": 0.19328680634498596,
"kl": 0.001689910888671875,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0001,
"reward": 0.10821354016661644,
"reward_std": 0.1421743929386139,
"rewards/cosine_scaled_reward": -0.05042751878499985,
"rewards/format_reward": 0.520833358168602,
"step": 92
},
{
"completion_length": 2476.979217529297,
"epoch": 0.10628571428571429,
"grad_norm": 0.21367445588111877,
"kl": 0.0015621185302734375,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0001,
"reward": 0.14456679113209248,
"reward_std": 0.1662643477320671,
"rewards/cosine_scaled_reward": 0.03526845946907997,
"rewards/format_reward": 0.47916667722165585,
"step": 93
},
{
"completion_length": 3352.0208740234375,
"epoch": 0.10742857142857143,
"grad_norm": 0.20487773418426514,
"kl": 0.00179290771484375,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0001,
"reward": 0.04233929002657533,
"reward_std": 0.1873803436756134,
"rewards/cosine_scaled_reward": -0.0499027743935585,
"rewards/format_reward": 0.2708333469927311,
"step": 94
},
{
"completion_length": 3058.8125,
"epoch": 0.10857142857142857,
"grad_norm": 0.2256304770708084,
"kl": 0.0010938644409179688,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0,
"reward": -0.011163771152496338,
"reward_std": 0.1628007385879755,
"rewards/cosine_scaled_reward": -0.16808456648141146,
"rewards/format_reward": 0.2916666716337204,
"step": 95
},
{
"completion_length": 2390.4166870117188,
"epoch": 0.10971428571428571,
"grad_norm": 0.20839492976665497,
"kl": 0.001537322998046875,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0001,
"reward": 0.08394679566845298,
"reward_std": 0.1445230431854725,
"rewards/cosine_scaled_reward": -0.0893700122833252,
"rewards/format_reward": 0.5000000149011612,
"step": 96
},
{
"completion_length": 2910.2709350585938,
"epoch": 0.11085714285714286,
"grad_norm": 0.20374347269535065,
"kl": 0.001979351043701172,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0001,
"reward": 0.09348384477198124,
"reward_std": 0.1994321532547474,
"rewards/cosine_scaled_reward": 0.0048431046307086945,
"rewards/format_reward": 0.33333333395421505,
"step": 97
},
{
"completion_length": 2694.2083740234375,
"epoch": 0.112,
"grad_norm": 0.26260530948638916,
"kl": 0.008546829223632812,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0003,
"reward": 0.06050444394350052,
"reward_std": 0.16644578985869884,
"rewards/cosine_scaled_reward": -0.08548790030181408,
"rewards/format_reward": 0.3958333432674408,
"step": 98
},
{
"completion_length": 2938.6875,
"epoch": 0.11314285714285714,
"grad_norm": 0.22185884416103363,
"kl": 0.001251220703125,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0001,
"reward": 0.12049873173236847,
"reward_std": 0.12151942402124405,
"rewards/cosine_scaled_reward": 0.03361584059894085,
"rewards/format_reward": 0.3958333544433117,
"step": 99
},
{
"completion_length": 3105.5834350585938,
"epoch": 0.11428571428571428,
"grad_norm": 0.18423013389110565,
"kl": 0.0013895034790039062,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0001,
"reward": 0.025272036204114556,
"reward_std": 0.20255185291171074,
"rewards/cosine_scaled_reward": -0.10323571693152189,
"rewards/format_reward": 0.31250000558793545,
"step": 100
},
{
"completion_length": 2749.250030517578,
"epoch": 0.11542857142857142,
"grad_norm": 0.18827371299266815,
"kl": 0.00228118896484375,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0001,
"reward": 0.03973736334592104,
"reward_std": 0.17379421554505825,
"rewards/cosine_scaled_reward": -0.15294075850397348,
"rewards/format_reward": 0.4583333544433117,
"step": 101
},
{
"completion_length": 2767.0834350585938,
"epoch": 0.11657142857142858,
"grad_norm": 0.16772513091564178,
"kl": 0.002315521240234375,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0001,
"reward": 0.0401492640376091,
"reward_std": 0.14047331921756268,
"rewards/cosine_scaled_reward": -0.11048189923167229,
"rewards/format_reward": 0.375,
"step": 102
},
{
"completion_length": 3021.2708740234375,
"epoch": 0.11771428571428572,
"grad_norm": 0.1802755743265152,
"kl": 0.001617431640625,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0001,
"reward": 0.0870060611050576,
"reward_std": 0.17526693642139435,
"rewards/cosine_scaled_reward": -0.03904236480593681,
"rewards/format_reward": 0.4166666716337204,
"step": 103
},
{
"completion_length": 2614.1458740234375,
"epoch": 0.11885714285714286,
"grad_norm": 0.1638043224811554,
"kl": 0.0009593963623046875,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0,
"reward": 0.23584382608532906,
"reward_std": 0.19611848145723343,
"rewards/cosine_scaled_reward": 0.20568424928933382,
"rewards/format_reward": 0.5,
"step": 104
},
{
"completion_length": 3105.791717529297,
"epoch": 0.12,
"grad_norm": 0.18571417033672333,
"kl": 0.00255584716796875,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0001,
"reward": -0.0297448318451643,
"reward_std": 0.1015834640711546,
"rewards/cosine_scaled_reward": -0.1741093248128891,
"rewards/format_reward": 0.2291666679084301,
"step": 105
},
{
"completion_length": 2815.8126220703125,
"epoch": 0.12114285714285715,
"grad_norm": 0.21194417774677277,
"kl": 0.0025634765625,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0001,
"reward": 0.11232324969023466,
"reward_std": 0.21721220761537552,
"rewards/cosine_scaled_reward": 0.01769975572824478,
"rewards/format_reward": 0.3958333432674408,
"step": 106
},
{
"completion_length": 2541.416748046875,
"epoch": 0.12228571428571429,
"grad_norm": 0.22722160816192627,
"kl": 0.002227783203125,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0001,
"reward": 0.019650347530841827,
"reward_std": 0.21003135293722153,
"rewards/cosine_scaled_reward": -0.17047632485628128,
"rewards/format_reward": 0.41666667722165585,
"step": 107
},
{
"completion_length": 2334.2500915527344,
"epoch": 0.12342857142857143,
"grad_norm": 0.24612505733966827,
"kl": 0.0037975311279296875,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0002,
"reward": 0.24177123652771115,
"reward_std": 0.19179029762744904,
"rewards/cosine_scaled_reward": 0.16906813159585,
"rewards/format_reward": 0.5833333432674408,
"step": 108
},
{
"completion_length": 2786.5833740234375,
"epoch": 0.12457142857142857,
"grad_norm": 0.23426760733127594,
"kl": 0.002147674560546875,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0001,
"reward": 0.019255569204688072,
"reward_std": 0.19491948932409286,
"rewards/cosine_scaled_reward": -0.15080422349274158,
"rewards/format_reward": 0.3750000149011612,
"step": 109
},
{
"completion_length": 2611.3959045410156,
"epoch": 0.12571428571428572,
"grad_norm": 0.2736619710922241,
"kl": 0.0035686492919921875,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0001,
"reward": 0.29651589691638947,
"reward_std": 0.18929306417703629,
"rewards/cosine_scaled_reward": 0.23898081667721272,
"rewards/format_reward": 0.6666666865348816,
"step": 110
},
{
"completion_length": 2047.7917175292969,
"epoch": 0.12685714285714286,
"grad_norm": 0.22474665939807892,
"kl": 0.00275421142578125,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0001,
"reward": 0.19512577797286212,
"reward_std": 0.17880065739154816,
"rewards/cosine_scaled_reward": 0.03617064421996474,
"rewards/format_reward": 0.666666679084301,
"step": 111
},
{
"completion_length": 2695.3958740234375,
"epoch": 0.128,
"grad_norm": 0.20437544584274292,
"kl": 0.0023288726806640625,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0001,
"reward": 0.18288636580109596,
"reward_std": 0.19504074566066265,
"rewards/cosine_scaled_reward": 0.07991745974868536,
"rewards/format_reward": 0.5416666716337204,
"step": 112
},
{
"completion_length": 2568.4583435058594,
"epoch": 0.12914285714285714,
"grad_norm": 0.20730017125606537,
"kl": 0.0066070556640625,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0003,
"reward": 0.13345283642411232,
"reward_std": 0.1859412807971239,
"rewards/cosine_scaled_reward": 0.004893161356449127,
"rewards/format_reward": 0.5,
"step": 113
},
{
"completion_length": 1677.6875305175781,
"epoch": 0.13028571428571428,
"grad_norm": 0.2240653932094574,
"kl": 0.0018825531005859375,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0001,
"reward": 0.27667421475052834,
"reward_std": 0.15631183050572872,
"rewards/cosine_scaled_reward": 0.16248321998864412,
"rewards/format_reward": 0.7291666865348816,
"step": 114
},
{
"completion_length": 2820.479248046875,
"epoch": 0.13142857142857142,
"grad_norm": 0.1784657984972,
"kl": 0.0027828216552734375,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0001,
"reward": 0.1664815954864025,
"reward_std": 0.1997350938618183,
"rewards/cosine_scaled_reward": 0.08072170801460743,
"rewards/format_reward": 0.4791666865348816,
"step": 115
},
{
"completion_length": 2746.4583740234375,
"epoch": 0.13257142857142856,
"grad_norm": 0.20344781875610352,
"kl": 0.00470733642578125,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0002,
"reward": 0.04535680764820427,
"reward_std": 0.10419335402548313,
"rewards/cosine_scaled_reward": -0.08311295229941607,
"rewards/format_reward": 0.3333333358168602,
"step": 116
},
{
"completion_length": 3282.0208740234375,
"epoch": 0.1337142857142857,
"grad_norm": 0.1357363760471344,
"kl": 0.002735137939453125,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0001,
"reward": -0.02954237163066864,
"reward_std": 0.14159450307488441,
"rewards/cosine_scaled_reward": -0.1510828686878085,
"rewards/format_reward": 0.1875,
"step": 117
},
{
"completion_length": 2032.229232788086,
"epoch": 0.13485714285714287,
"grad_norm": 0.27717867493629456,
"kl": 0.01053619384765625,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0004,
"reward": 0.13773571141064167,
"reward_std": 0.1858275569975376,
"rewards/cosine_scaled_reward": -0.0683070570230484,
"rewards/format_reward": 0.666666679084301,
"step": 118
},
{
"completion_length": 2059.5834197998047,
"epoch": 0.136,
"grad_norm": 0.25735679268836975,
"kl": 0.005496978759765625,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0002,
"reward": 0.10206396621651947,
"reward_std": 0.13856840506196022,
"rewards/cosine_scaled_reward": -0.10751124238595366,
"rewards/format_reward": 0.6041666865348816,
"step": 119
},
{
"completion_length": 2900.416748046875,
"epoch": 0.13714285714285715,
"grad_norm": 0.22331129014492035,
"kl": 0.0034637451171875,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0001,
"reward": 0.0223141775932163,
"reward_std": 0.2024293877184391,
"rewards/cosine_scaled_reward": -0.18740470334887505,
"rewards/format_reward": 0.4583333507180214,
"step": 120
},
{
"completion_length": 2943.8958740234375,
"epoch": 0.1382857142857143,
"grad_norm": 0.17821165919303894,
"kl": 0.002651214599609375,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0001,
"reward": 0.04273420386016369,
"reward_std": 0.1815544180572033,
"rewards/cosine_scaled_reward": -0.09382643923163414,
"rewards/format_reward": 0.354166679084301,
"step": 121
},
{
"completion_length": 2829.3958740234375,
"epoch": 0.13942857142857143,
"grad_norm": 0.21492768824100494,
"kl": 0.005435943603515625,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0002,
"reward": 0.006649336777627468,
"reward_std": 0.13518287613987923,
"rewards/cosine_scaled_reward": -0.15551936253905296,
"rewards/format_reward": 0.3333333358168602,
"step": 122
},
{
"completion_length": 2489.0000915527344,
"epoch": 0.14057142857142857,
"grad_norm": 0.5098468661308289,
"kl": 0.02829742431640625,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0011,
"reward": 0.12213594932109118,
"reward_std": 0.23784950748085976,
"rewards/cosine_scaled_reward": -0.005025926977396011,
"rewards/format_reward": 0.4791666828095913,
"step": 123
},
{
"completion_length": 2508.187530517578,
"epoch": 0.1417142857142857,
"grad_norm": 0.23071174323558807,
"kl": 0.0027313232421875,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0001,
"reward": 0.15378618612885475,
"reward_std": 0.2259940393269062,
"rewards/cosine_scaled_reward": 0.025300168432295322,
"rewards/format_reward": 0.5416666753590107,
"step": 124
},
{
"completion_length": 3111.6250610351562,
"epoch": 0.14285714285714285,
"grad_norm": 0.1613474041223526,
"kl": 0.0019855499267578125,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0001,
"reward": 0.00011996552348136902,
"reward_std": 0.12452885881066322,
"rewards/cosine_scaled_reward": -0.15928708389401436,
"rewards/format_reward": 0.3125,
"step": 125
},
{
"completion_length": 2480.2293090820312,
"epoch": 0.144,
"grad_norm": 0.23107463121414185,
"kl": 0.00391387939453125,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0002,
"reward": 0.10650707967579365,
"reward_std": 0.2277711071074009,
"rewards/cosine_scaled_reward": -0.06300333887338638,
"rewards/format_reward": 0.5416666865348816,
"step": 126
},
{
"completion_length": 2718.9376220703125,
"epoch": 0.14514285714285713,
"grad_norm": 0.16975504159927368,
"kl": 0.0023326873779296875,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0001,
"reward": 0.02971411682665348,
"reward_std": 0.1977171078324318,
"rewards/cosine_scaled_reward": -0.14219553396105766,
"rewards/format_reward": 0.3958333469927311,
"step": 127
},
{
"completion_length": 2266.666778564453,
"epoch": 0.1462857142857143,
"grad_norm": 0.18879002332687378,
"kl": 0.0033416748046875,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0001,
"reward": 0.15120768686756492,
"reward_std": 0.20895353704690933,
"rewards/cosine_scaled_reward": 0.01915425295010209,
"rewards/format_reward": 0.5416666716337204,
"step": 128
},
{
"completion_length": 2129.500015258789,
"epoch": 0.14742857142857144,
"grad_norm": 0.26337873935699463,
"kl": 0.0062255859375,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0002,
"reward": 0.1327032782137394,
"reward_std": 0.19304194673895836,
"rewards/cosine_scaled_reward": -0.03845389559864998,
"rewards/format_reward": 0.5833333488553762,
"step": 129
},
{
"completion_length": 2919.5208740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.1732492297887802,
"kl": 0.00247955322265625,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0001,
"reward": 0.02012626640498638,
"reward_std": 0.16613218560814857,
"rewards/cosine_scaled_reward": -0.1293521734769456,
"rewards/format_reward": 0.3333333507180214,
"step": 130
},
{
"completion_length": 2883.4375,
"epoch": 0.14971428571428572,
"grad_norm": 0.16014830768108368,
"kl": 0.0026702880859375,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0001,
"reward": 0.1132216090336442,
"reward_std": 0.16543111577630043,
"rewards/cosine_scaled_reward": -0.01893126592040062,
"rewards/format_reward": 0.479166679084301,
"step": 131
},
{
"completion_length": 3016.6875610351562,
"epoch": 0.15085714285714286,
"grad_norm": 0.18878363072872162,
"kl": 0.0045375823974609375,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0002,
"reward": 0.0002798512578010559,
"reward_std": 0.1798754744231701,
"rewards/cosine_scaled_reward": -0.13517173379659653,
"rewards/format_reward": 0.27083334885537624,
"step": 132
},
{
"completion_length": 2867.4584350585938,
"epoch": 0.152,
"grad_norm": 0.18914277851581573,
"kl": 0.004039764404296875,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0002,
"reward": 0.05362038780003786,
"reward_std": 0.1794121377170086,
"rewards/cosine_scaled_reward": -0.09410551190376282,
"rewards/format_reward": 0.3958333432674408,
"step": 133
},
{
"completion_length": 1913.1042175292969,
"epoch": 0.15314285714285714,
"grad_norm": 0.27556517720222473,
"kl": 0.00406646728515625,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0002,
"reward": 0.12812363170087337,
"reward_std": 0.16640142910182476,
"rewards/cosine_scaled_reward": -0.07728635333478451,
"rewards/format_reward": 0.645833358168602,
"step": 134
},
{
"completion_length": 2429.604217529297,
"epoch": 0.15428571428571428,
"grad_norm": 0.21053838729858398,
"kl": 0.003673553466796875,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0001,
"reward": 0.07179796043783426,
"reward_std": 0.13138782046735287,
"rewards/cosine_scaled_reward": -0.07374508306384087,
"rewards/format_reward": 0.4166666716337204,
"step": 135
},
{
"completion_length": 1984.0834045410156,
"epoch": 0.15542857142857142,
"grad_norm": 0.26108235120773315,
"kl": 0.003261566162109375,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0001,
"reward": 0.18163915304467082,
"reward_std": 0.17089967243373394,
"rewards/cosine_scaled_reward": -0.029379967600107193,
"rewards/format_reward": 0.75,
"step": 136
},
{
"completion_length": 2869.9583740234375,
"epoch": 0.15657142857142858,
"grad_norm": 0.20500995218753815,
"kl": 0.0051708221435546875,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0002,
"reward": -0.008513325825333595,
"reward_std": 0.16568787395954132,
"rewards/cosine_scaled_reward": -0.22801739536225796,
"rewards/format_reward": 0.4166666828095913,
"step": 137
},
{
"completion_length": 1847.3125610351562,
"epoch": 0.15771428571428572,
"grad_norm": 0.2537723481655121,
"kl": 0.003082275390625,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0001,
"reward": 0.2711644656956196,
"reward_std": 0.20930937677621841,
"rewards/cosine_scaled_reward": 0.16767838457599282,
"rewards/format_reward": 0.708333358168602,
"step": 138
},
{
"completion_length": 2193.291748046875,
"epoch": 0.15885714285714286,
"grad_norm": 0.21132107079029083,
"kl": 0.003009796142578125,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0001,
"reward": 0.1569109088741243,
"reward_std": 0.15476588159799576,
"rewards/cosine_scaled_reward": -0.025194160640239716,
"rewards/format_reward": 0.6458333432674408,
"step": 139
},
{
"completion_length": 2260.416717529297,
"epoch": 0.16,
"grad_norm": 0.2160802185535431,
"kl": 0.00319671630859375,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0001,
"reward": 0.13407661230303347,
"reward_std": 0.16906898841261864,
"rewards/cosine_scaled_reward": -0.06701312679797411,
"rewards/format_reward": 0.6458333432674408,
"step": 140
},
{
"completion_length": 2282.6875,
"epoch": 0.16114285714285714,
"grad_norm": 0.24112887680530548,
"kl": 0.0041351318359375,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0002,
"reward": 0.13663070276379585,
"reward_std": 0.1782714631408453,
"rewards/cosine_scaled_reward": -0.027498777955770493,
"rewards/format_reward": 0.583333358168602,
"step": 141
},
{
"completion_length": 2172.166748046875,
"epoch": 0.16228571428571428,
"grad_norm": 0.2610005736351013,
"kl": 0.003330230712890625,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0001,
"reward": 0.19508975371718407,
"reward_std": 0.21969266794621944,
"rewards/cosine_scaled_reward": 0.08089338196441531,
"rewards/format_reward": 0.583333358168602,
"step": 142
},
{
"completion_length": 2489.541717529297,
"epoch": 0.16342857142857142,
"grad_norm": 0.18988506495952606,
"kl": 0.0038480758666992188,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0002,
"reward": 0.16926367208361626,
"reward_std": 0.19583113491535187,
"rewards/cosine_scaled_reward": 0.03587030619382858,
"rewards/format_reward": 0.5833333544433117,
"step": 143
},
{
"completion_length": 2520.7084350585938,
"epoch": 0.16457142857142856,
"grad_norm": 0.20590053498744965,
"kl": 0.00385284423828125,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0002,
"reward": 0.16642944514751434,
"reward_std": 0.20522606186568737,
"rewards/cosine_scaled_reward": 0.040587374940514565,
"rewards/format_reward": 0.5625000260770321,
"step": 144
},
{
"completion_length": 2526.166748046875,
"epoch": 0.1657142857142857,
"grad_norm": 0.23322638869285583,
"kl": 0.0052337646484375,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0002,
"reward": 0.045561966486275196,
"reward_std": 0.2455296441912651,
"rewards/cosine_scaled_reward": -0.14990929747000337,
"rewards/format_reward": 0.4791666865348816,
"step": 145
},
{
"completion_length": 2197.1458740234375,
"epoch": 0.16685714285714287,
"grad_norm": 0.22786113619804382,
"kl": 0.00408172607421875,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0002,
"reward": 0.02551903622224927,
"reward_std": 0.1334901675581932,
"rewards/cosine_scaled_reward": -0.25134460628032684,
"rewards/format_reward": 0.6041666772216558,
"step": 146
},
{
"completion_length": 2167.7291717529297,
"epoch": 0.168,
"grad_norm": 0.255658894777298,
"kl": 0.00374603271484375,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0001,
"reward": 0.22885112185031176,
"reward_std": 0.1546173021197319,
"rewards/cosine_scaled_reward": 0.14800470299087465,
"rewards/format_reward": 0.5833333432674408,
"step": 147
},
{
"completion_length": 2297.666748046875,
"epoch": 0.16914285714285715,
"grad_norm": 0.2738894820213318,
"kl": 0.00521087646484375,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0002,
"reward": 0.03275088965892792,
"reward_std": 0.1455500442534685,
"rewards/cosine_scaled_reward": -0.2082662135362625,
"rewards/format_reward": 0.5416666716337204,
"step": 148
},
{
"completion_length": 2671.7709350585938,
"epoch": 0.1702857142857143,
"grad_norm": 0.20778509974479675,
"kl": 0.004669189453125,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0002,
"reward": 0.17761395254638046,
"reward_std": 0.18856573849916458,
"rewards/cosine_scaled_reward": 0.08179567754268646,
"rewards/format_reward": 0.5208333432674408,
"step": 149
},
{
"completion_length": 2050.166717529297,
"epoch": 0.17142857142857143,
"grad_norm": 0.22614718973636627,
"kl": 0.005207061767578125,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0002,
"reward": 0.07955310121178627,
"reward_std": 0.1677793301641941,
"rewards/cosine_scaled_reward": -0.17033380083739758,
"rewards/format_reward": 0.6458333395421505,
"step": 150
},
{
"completion_length": 2235.6459197998047,
"epoch": 0.17257142857142857,
"grad_norm": 0.19895651936531067,
"kl": 0.0030078887939453125,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0001,
"reward": 0.0866355006583035,
"reward_std": 0.10392699390649796,
"rewards/cosine_scaled_reward": -0.09595790691673756,
"rewards/format_reward": 0.5208333432674408,
"step": 151
},
{
"completion_length": 3005.9583740234375,
"epoch": 0.1737142857142857,
"grad_norm": 0.31027477979660034,
"kl": 0.0070343017578125,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0003,
"reward": -0.002403062768280506,
"reward_std": 0.1533808410167694,
"rewards/cosine_scaled_reward": -0.13117293268442154,
"rewards/format_reward": 0.25000001303851604,
"step": 152
},
{
"completion_length": 2327.3541870117188,
"epoch": 0.17485714285714285,
"grad_norm": 0.20010127127170563,
"kl": 0.0037221908569335938,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0001,
"reward": 0.23278480861335993,
"reward_std": 0.19316110759973526,
"rewards/cosine_scaled_reward": 0.12676820158958435,
"rewards/format_reward": 0.6458333432674408,
"step": 153
},
{
"completion_length": 2711.229248046875,
"epoch": 0.176,
"grad_norm": 0.2796197831630707,
"kl": 0.00414276123046875,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0002,
"reward": 0.15084031783044338,
"reward_std": 0.14097959361970425,
"rewards/cosine_scaled_reward": 0.0931478925049305,
"rewards/format_reward": 0.39583333395421505,
"step": 154
},
{
"completion_length": 2727.6875610351562,
"epoch": 0.17714285714285713,
"grad_norm": 0.2060837596654892,
"kl": 0.0041351318359375,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0002,
"reward": 0.026034665293991566,
"reward_std": 0.20783771388232708,
"rewards/cosine_scaled_reward": -0.17815358191728592,
"rewards/format_reward": 0.4583333395421505,
"step": 155
},
{
"completion_length": 2652.3125610351562,
"epoch": 0.1782857142857143,
"grad_norm": 0.1977294683456421,
"kl": 0.00499725341796875,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0002,
"reward": -0.00022369646467268467,
"reward_std": 0.10643414594233036,
"rewards/cosine_scaled_reward": -0.2093517892062664,
"rewards/format_reward": 0.4166666828095913,
"step": 156
},
{
"completion_length": 2042.7292175292969,
"epoch": 0.17942857142857144,
"grad_norm": 0.25187933444976807,
"kl": 0.0039520263671875,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0002,
"reward": 0.16695543192327023,
"reward_std": 0.23595153540372849,
"rewards/cosine_scaled_reward": -0.006288483738899231,
"rewards/format_reward": 0.645833358168602,
"step": 157
},
{
"completion_length": 2814.729217529297,
"epoch": 0.18057142857142858,
"grad_norm": 0.17815591394901276,
"kl": 0.00458526611328125,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0002,
"reward": -0.004143957048654556,
"reward_std": 0.11188133526593447,
"rewards/cosine_scaled_reward": -0.22808771207928658,
"rewards/format_reward": 0.43750000558793545,
"step": 158
},
{
"completion_length": 1992.2500610351562,
"epoch": 0.18171428571428572,
"grad_norm": 0.21933415532112122,
"kl": 0.004100799560546875,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0002,
"reward": 0.2522421330213547,
"reward_std": 0.22296695411205292,
"rewards/cosine_scaled_reward": 0.08081851835595444,
"rewards/format_reward": 0.8125000149011612,
"step": 159
},
{
"completion_length": 2079.416748046875,
"epoch": 0.18285714285714286,
"grad_norm": 0.19679833948612213,
"kl": 0.003993988037109375,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0002,
"reward": 0.18703680613543838,
"reward_std": 0.260627418756485,
"rewards/cosine_scaled_reward": 0.00598154217004776,
"rewards/format_reward": 0.7083333432674408,
"step": 160
},
{
"completion_length": 3009.541717529297,
"epoch": 0.184,
"grad_norm": 0.24472463130950928,
"kl": 0.005504608154296875,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0002,
"reward": 0.014636407606303692,
"reward_std": 0.1625995971262455,
"rewards/cosine_scaled_reward": -0.15822336450219154,
"rewards/format_reward": 0.37500000931322575,
"step": 161
},
{
"completion_length": 2617.3751220703125,
"epoch": 0.18514285714285714,
"grad_norm": 0.18487517535686493,
"kl": 0.0045013427734375,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0002,
"reward": 0.11570010334253311,
"reward_std": 0.17325943149626255,
"rewards/cosine_scaled_reward": -0.03621460869908333,
"rewards/format_reward": 0.5208333395421505,
"step": 162
},
{
"completion_length": 2124.4584045410156,
"epoch": 0.18628571428571428,
"grad_norm": 0.3504065275192261,
"kl": 0.006938934326171875,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0003,
"reward": 0.11949230777099729,
"reward_std": 0.21487346105277538,
"rewards/cosine_scaled_reward": -0.061273553408682346,
"rewards/format_reward": 0.5833333507180214,
"step": 163
},
{
"completion_length": 2052.791717529297,
"epoch": 0.18742857142857142,
"grad_norm": 0.27968698740005493,
"kl": 0.00577545166015625,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0002,
"reward": 0.0795913627371192,
"reward_std": 0.21591341868042946,
"rewards/cosine_scaled_reward": -0.12282621720805764,
"rewards/format_reward": 0.5416666716337204,
"step": 164
},
{
"completion_length": 2127.2708740234375,
"epoch": 0.18857142857142858,
"grad_norm": 0.21363244950771332,
"kl": 0.004779815673828125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0002,
"reward": 0.19190740585327148,
"reward_std": 0.221625704318285,
"rewards/cosine_scaled_reward": 0.05086267925798893,
"rewards/format_reward": 0.6458333395421505,
"step": 165
},
{
"completion_length": 1618.8959350585938,
"epoch": 0.18971428571428572,
"grad_norm": 0.21441620588302612,
"kl": 0.0042877197265625,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0002,
"reward": 0.14088203478604555,
"reward_std": 0.12707579229027033,
"rewards/cosine_scaled_reward": -0.08422760479152203,
"rewards/format_reward": 0.7083333432674408,
"step": 166
},
{
"completion_length": 2476.7708740234375,
"epoch": 0.19085714285714286,
"grad_norm": 0.21957609057426453,
"kl": 0.005298614501953125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0002,
"reward": -0.01745962956920266,
"reward_std": 0.12951701134443283,
"rewards/cosine_scaled_reward": -0.2852119877934456,
"rewards/format_reward": 0.5000000111758709,
"step": 167
},
{
"completion_length": 1929.354248046875,
"epoch": 0.192,
"grad_norm": 0.23081496357917786,
"kl": 0.004123687744140625,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0002,
"reward": 0.13488853815943003,
"reward_std": 0.18247101455926895,
"rewards/cosine_scaled_reward": -0.1040015157777816,
"rewards/format_reward": 0.7291666865348816,
"step": 168
},
{
"completion_length": 2015.229232788086,
"epoch": 0.19314285714285714,
"grad_norm": 0.28857842087745667,
"kl": 0.004528045654296875,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0002,
"reward": 0.26697871275246143,
"reward_std": 0.20995871722698212,
"rewards/cosine_scaled_reward": 0.1499726166948676,
"rewards/format_reward": 0.729166679084301,
"step": 169
},
{
"completion_length": 2331.750030517578,
"epoch": 0.19428571428571428,
"grad_norm": 0.20006102323532104,
"kl": 0.00605010986328125,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0002,
"reward": 0.08345442125573754,
"reward_std": 0.19285878352820873,
"rewards/cosine_scaled_reward": -0.10043650306761265,
"rewards/format_reward": 0.5208333432674408,
"step": 170
},
{
"completion_length": 1634.4791870117188,
"epoch": 0.19542857142857142,
"grad_norm": 0.26344433426856995,
"kl": 0.00447845458984375,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0002,
"reward": 0.10782040096819401,
"reward_std": 0.13615941908210516,
"rewards/cosine_scaled_reward": -0.16944693960249424,
"rewards/format_reward": 0.7500000055879354,
"step": 171
},
{
"completion_length": 1950.7292022705078,
"epoch": 0.19657142857142856,
"grad_norm": 0.26507794857025146,
"kl": 0.005542755126953125,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0002,
"reward": 0.1530790887773037,
"reward_std": 0.220405962318182,
"rewards/cosine_scaled_reward": -0.01874719187617302,
"rewards/format_reward": 0.6250000149011612,
"step": 172
},
{
"completion_length": 2552.2500915527344,
"epoch": 0.1977142857142857,
"grad_norm": 0.29694148898124695,
"kl": 0.0063629150390625,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0003,
"reward": 0.12018180638551712,
"reward_std": 0.16073662787675858,
"rewards/cosine_scaled_reward": -0.01620076596736908,
"rewards/format_reward": 0.4791666679084301,
"step": 173
},
{
"completion_length": 2430.291717529297,
"epoch": 0.19885714285714284,
"grad_norm": 0.18767432868480682,
"kl": 0.007476806640625,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0003,
"reward": 0.03346575051546097,
"reward_std": 0.11665205657482147,
"rewards/cosine_scaled_reward": -0.1950590880587697,
"rewards/format_reward": 0.5208333432674408,
"step": 174
},
{
"completion_length": 2481.187545776367,
"epoch": 0.2,
"grad_norm": 0.26425570249557495,
"kl": 0.00807952880859375,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0003,
"reward": 0.14548239950090647,
"reward_std": 0.17145178094506264,
"rewards/cosine_scaled_reward": 0.005486873909831047,
"rewards/format_reward": 0.5416666716337204,
"step": 175
},
{
"completion_length": 1953.3125305175781,
"epoch": 0.20114285714285715,
"grad_norm": 0.2169903814792633,
"kl": 0.006252288818359375,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0003,
"reward": 0.15049799345433712,
"reward_std": 0.18127938732504845,
"rewards/cosine_scaled_reward": -0.06541152065619826,
"rewards/format_reward": 0.708333358168602,
"step": 176
},
{
"completion_length": 1905.7083740234375,
"epoch": 0.2022857142857143,
"grad_norm": 0.2057493031024933,
"kl": 0.0059356689453125,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0002,
"reward": 0.20941531658172607,
"reward_std": 0.18847975879907608,
"rewards/cosine_scaled_reward": 0.06081422168063,
"rewards/format_reward": 0.6875000298023224,
"step": 177
},
{
"completion_length": 1686.2709045410156,
"epoch": 0.20342857142857143,
"grad_norm": 0.2742275595664978,
"kl": 0.00555419921875,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0002,
"reward": 0.282832570374012,
"reward_std": 0.15610528737306595,
"rewards/cosine_scaled_reward": 0.13618910312652588,
"rewards/format_reward": 0.8125000149011612,
"step": 178
},
{
"completion_length": 2144.1251220703125,
"epoch": 0.20457142857142857,
"grad_norm": 0.20610445737838745,
"kl": 0.00775909423828125,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0003,
"reward": 0.18123364634811878,
"reward_std": 0.16907534934580326,
"rewards/cosine_scaled_reward": 0.023578599095344543,
"rewards/format_reward": 0.6458333432674408,
"step": 179
},
{
"completion_length": 2225.3959350585938,
"epoch": 0.2057142857142857,
"grad_norm": 0.2948279082775116,
"kl": 0.008434295654296875,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0003,
"reward": 0.10301533341407776,
"reward_std": 0.23226897418498993,
"rewards/cosine_scaled_reward": -0.09574815258383751,
"rewards/format_reward": 0.5833333488553762,
"step": 180
},
{
"completion_length": 2334.104217529297,
"epoch": 0.20685714285714285,
"grad_norm": 0.21989773213863373,
"kl": 0.004322052001953125,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0002,
"reward": 0.1680104963015765,
"reward_std": 0.2741067036986351,
"rewards/cosine_scaled_reward": -0.010420721024274826,
"rewards/format_reward": 0.6666666865348816,
"step": 181
},
{
"completion_length": 1716.2500305175781,
"epoch": 0.208,
"grad_norm": 0.2736404240131378,
"kl": 0.006591796875,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0003,
"reward": 0.19305634032934904,
"reward_std": 0.17649215832352638,
"rewards/cosine_scaled_reward": -0.020499907433986664,
"rewards/format_reward": 0.7708333507180214,
"step": 182
},
{
"completion_length": 1984.6459045410156,
"epoch": 0.20914285714285713,
"grad_norm": 0.24810387194156647,
"kl": 0.00687408447265625,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0003,
"reward": 0.11633813101798296,
"reward_std": 0.19054009392857552,
"rewards/cosine_scaled_reward": -0.16023868951015174,
"rewards/format_reward": 0.770833358168602,
"step": 183
},
{
"completion_length": 2012.0834045410156,
"epoch": 0.2102857142857143,
"grad_norm": 0.24479207396507263,
"kl": 0.00539398193359375,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0002,
"reward": 0.20850215945392847,
"reward_std": 0.24574441090226173,
"rewards/cosine_scaled_reward": 0.05496228300035,
"rewards/format_reward": 0.6875000149011612,
"step": 184
},
{
"completion_length": 2561.9166870117188,
"epoch": 0.21142857142857144,
"grad_norm": 0.23638688027858734,
"kl": 0.00787353515625,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0003,
"reward": 0.037306661397451535,
"reward_std": 0.1875557340681553,
"rewards/cosine_scaled_reward": -0.17976870480924845,
"rewards/format_reward": 0.5000000260770321,
"step": 185
},
{
"completion_length": 2453.5625610351562,
"epoch": 0.21257142857142858,
"grad_norm": 0.30098989605903625,
"kl": 0.0072784423828125,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0003,
"reward": 0.15216808393597603,
"reward_std": 0.17800051532685757,
"rewards/cosine_scaled_reward": -0.01129375584423542,
"rewards/format_reward": 0.6041666716337204,
"step": 186
},
{
"completion_length": 2095.1250610351562,
"epoch": 0.21371428571428572,
"grad_norm": 0.20456546545028687,
"kl": 0.00882720947265625,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0004,
"reward": 0.27147069573402405,
"reward_std": 0.18336978182196617,
"rewards/cosine_scaled_reward": 0.13264761865139008,
"rewards/format_reward": 0.7708333432674408,
"step": 187
},
{
"completion_length": 1552.937515258789,
"epoch": 0.21485714285714286,
"grad_norm": 0.25879716873168945,
"kl": 0.00807952880859375,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0003,
"reward": 0.29217225313186646,
"reward_std": 0.23792832344770432,
"rewards/cosine_scaled_reward": 0.1374435918405652,
"rewards/format_reward": 0.8541666716337204,
"step": 188
},
{
"completion_length": 1397.5000305175781,
"epoch": 0.216,
"grad_norm": 0.2867569625377655,
"kl": 0.00785064697265625,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0003,
"reward": 0.16837791539728642,
"reward_std": 0.17263205349445343,
"rewards/cosine_scaled_reward": -0.12355193216353655,
"rewards/format_reward": 0.8958333432674408,
"step": 189
},
{
"completion_length": 2409.7501220703125,
"epoch": 0.21714285714285714,
"grad_norm": 0.23366515338420868,
"kl": 0.00925445556640625,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0004,
"reward": 0.08992326661245897,
"reward_std": 0.2077214140444994,
"rewards/cosine_scaled_reward": -0.05378926917910576,
"rewards/format_reward": 0.4583333469927311,
"step": 190
},
{
"completion_length": 1840.2708587646484,
"epoch": 0.21828571428571428,
"grad_norm": 0.24777474999427795,
"kl": 0.006641387939453125,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0003,
"reward": 0.1860494278371334,
"reward_std": 0.18764295056462288,
"rewards/cosine_scaled_reward": -0.014890416525304317,
"rewards/format_reward": 0.7500000149011612,
"step": 191
},
{
"completion_length": 2362.5208740234375,
"epoch": 0.21942857142857142,
"grad_norm": 0.24280861020088196,
"kl": 0.0124053955078125,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0005,
"reward": 0.0900897765532136,
"reward_std": 0.21396854892373085,
"rewards/cosine_scaled_reward": -0.11646672445931472,
"rewards/format_reward": 0.583333358168602,
"step": 192
},
{
"completion_length": 1967.1250915527344,
"epoch": 0.22057142857142858,
"grad_norm": 0.24913279712200165,
"kl": 0.01198577880859375,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0005,
"reward": 0.12212350871413946,
"reward_std": 0.1075000325217843,
"rewards/cosine_scaled_reward": -0.1432434804737568,
"rewards/format_reward": 0.7500000149011612,
"step": 193
},
{
"completion_length": 940.7083740234375,
"epoch": 0.22171428571428572,
"grad_norm": 0.30179327726364136,
"kl": 0.0063323974609375,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0003,
"reward": 0.4154938831925392,
"reward_std": 0.16113552078604698,
"rewards/cosine_scaled_reward": 0.308564942330122,
"rewards/format_reward": 0.9791666716337204,
"step": 194
},
{
"completion_length": 1250.1042175292969,
"epoch": 0.22285714285714286,
"grad_norm": 0.27229517698287964,
"kl": 0.0104827880859375,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0004,
"reward": 0.3013657033443451,
"reward_std": 0.14612397830933332,
"rewards/cosine_scaled_reward": 0.10115441353991628,
"rewards/format_reward": 0.9583333432674408,
"step": 195
},
{
"completion_length": 2639.7709350585938,
"epoch": 0.224,
"grad_norm": 0.27179622650146484,
"kl": 0.0116729736328125,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0005,
"reward": 0.08495328156277537,
"reward_std": 0.19798987358808517,
"rewards/cosine_scaled_reward": -0.04823115328326821,
"rewards/format_reward": 0.41666667722165585,
"step": 196
},
{
"completion_length": 2250.2708740234375,
"epoch": 0.22514285714285714,
"grad_norm": 0.2959842383861542,
"kl": 0.00969696044921875,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0004,
"reward": 0.05214718542993069,
"reward_std": 0.17444902658462524,
"rewards/cosine_scaled_reward": -0.19195930659770966,
"rewards/format_reward": 0.5833333544433117,
"step": 197
},
{
"completion_length": 1825.916748046875,
"epoch": 0.22628571428571428,
"grad_norm": 0.21343165636062622,
"kl": 0.00766754150390625,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0003,
"reward": 0.20536806993186474,
"reward_std": 0.19084695354104042,
"rewards/cosine_scaled_reward": -0.04184096306562424,
"rewards/format_reward": 0.8750000149011612,
"step": 198
},
{
"completion_length": 1967.9167175292969,
"epoch": 0.22742857142857142,
"grad_norm": 0.22258897125720978,
"kl": 0.00786590576171875,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0003,
"reward": 0.30358556658029556,
"reward_std": 0.20891420915722847,
"rewards/cosine_scaled_reward": 0.15997123159468174,
"rewards/format_reward": 0.8541666865348816,
"step": 199
},
{
"completion_length": 1702.916748046875,
"epoch": 0.22857142857142856,
"grad_norm": 0.3150264322757721,
"kl": 0.00884246826171875,
"learning_rate": 7.75e-07,
"loss": 0.0004,
"reward": 0.23448583600111306,
"reward_std": 0.21412995643913746,
"rewards/cosine_scaled_reward": 0.06124690920114517,
"rewards/format_reward": 0.7708333432674408,
"step": 200
},
{
"completion_length": 1527.5625610351562,
"epoch": 0.2297142857142857,
"grad_norm": 0.23923619091510773,
"kl": 0.0072479248046875,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0003,
"reward": 0.198734937235713,
"reward_std": 0.17404086515307426,
"rewards/cosine_scaled_reward": -0.06422888732049614,
"rewards/format_reward": 0.895833358168602,
"step": 201
},
{
"completion_length": 1109.9375457763672,
"epoch": 0.23085714285714284,
"grad_norm": 0.25801074504852295,
"kl": 0.00966644287109375,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0004,
"reward": 0.23950592055916786,
"reward_std": 0.1612282581627369,
"rewards/cosine_scaled_reward": -0.01809925213456154,
"rewards/format_reward": 0.9583333432674408,
"step": 202
},
{
"completion_length": 1692.8333740234375,
"epoch": 0.232,
"grad_norm": 0.31517601013183594,
"kl": 0.0102996826171875,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0004,
"reward": 0.24487797170877457,
"reward_std": 0.21635670214891434,
"rewards/cosine_scaled_reward": 0.08476148918271065,
"rewards/format_reward": 0.7708333432674408,
"step": 203
},
{
"completion_length": 1215.8750305175781,
"epoch": 0.23314285714285715,
"grad_norm": 0.2657667398452759,
"kl": 0.00862884521484375,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0003,
"reward": 0.2890103794634342,
"reward_std": 0.24866387993097305,
"rewards/cosine_scaled_reward": 0.1112285777926445,
"rewards/format_reward": 0.8750000149011612,
"step": 204
},
{
"completion_length": 1525.3333435058594,
"epoch": 0.2342857142857143,
"grad_norm": 0.21206864714622498,
"kl": 0.00908660888671875,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0004,
"reward": 0.23077455908060074,
"reward_std": 0.1799892894923687,
"rewards/cosine_scaled_reward": -0.032498230517376214,
"rewards/format_reward": 0.9583333432674408,
"step": 205
},
{
"completion_length": 2053.3125610351562,
"epoch": 0.23542857142857143,
"grad_norm": 0.28847047686576843,
"kl": 0.0101165771484375,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0004,
"reward": 0.11000457312911749,
"reward_std": 0.18665008433163166,
"rewards/cosine_scaled_reward": -0.14993075653910637,
"rewards/format_reward": 0.7291666865348816,
"step": 206
},
{
"completion_length": 1849.9167175292969,
"epoch": 0.23657142857142857,
"grad_norm": 0.19775180518627167,
"kl": 0.006805419921875,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0003,
"reward": 0.2954826056957245,
"reward_std": 0.25379542633891106,
"rewards/cosine_scaled_reward": 0.12750269658863544,
"rewards/format_reward": 0.8750000149011612,
"step": 207
},
{
"completion_length": 1929.041748046875,
"epoch": 0.2377142857142857,
"grad_norm": 0.25739434361457825,
"kl": 0.009552001953125,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0004,
"reward": 0.1019433755427599,
"reward_std": 0.13344106450676918,
"rewards/cosine_scaled_reward": -0.15905495546758175,
"rewards/format_reward": 0.7083333432674408,
"step": 208
},
{
"completion_length": 1011.3333740234375,
"epoch": 0.23885714285714285,
"grad_norm": 0.3413412272930145,
"kl": 0.00860595703125,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0003,
"reward": 0.17312408424913883,
"reward_std": 0.20489512011408806,
"rewards/cosine_scaled_reward": -0.11678014509379864,
"rewards/format_reward": 0.8958333432674408,
"step": 209
},
{
"completion_length": 1356.7084045410156,
"epoch": 0.24,
"grad_norm": 0.23836910724639893,
"kl": 0.00788116455078125,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0003,
"reward": 0.31968575716018677,
"reward_std": 0.156296506524086,
"rewards/cosine_scaled_reward": 0.15581995248794556,
"rewards/format_reward": 0.9166666716337204,
"step": 210
},
{
"completion_length": 2032.6458740234375,
"epoch": 0.24114285714285713,
"grad_norm": 0.23086276650428772,
"kl": 0.0112152099609375,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0004,
"reward": 0.13852777890861034,
"reward_std": 0.1855682022869587,
"rewards/cosine_scaled_reward": -0.12528848741203547,
"rewards/format_reward": 0.7916666865348816,
"step": 211
},
{
"completion_length": 1474.5416717529297,
"epoch": 0.2422857142857143,
"grad_norm": 0.2737053632736206,
"kl": 0.009735107421875,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0004,
"reward": 0.2501720953732729,
"reward_std": 0.19472362473607063,
"rewards/cosine_scaled_reward": 0.0749430526047945,
"rewards/format_reward": 0.8125000149011612,
"step": 212
},
{
"completion_length": 1883.7500610351562,
"epoch": 0.24342857142857144,
"grad_norm": 0.2124582827091217,
"kl": 0.01052093505859375,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0004,
"reward": 0.0875893197953701,
"reward_std": 0.12948580272495747,
"rewards/cosine_scaled_reward": -0.19460760243237019,
"rewards/format_reward": 0.7291666865348816,
"step": 213
},
{
"completion_length": 1905.2500610351562,
"epoch": 0.24457142857142858,
"grad_norm": 0.1766165941953659,
"kl": 0.0074462890625,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0003,
"reward": 0.17481125239282846,
"reward_std": 0.18312595039606094,
"rewards/cosine_scaled_reward": -0.08106975071132183,
"rewards/format_reward": 0.8333333432674408,
"step": 214
},
{
"completion_length": 1422.0209045410156,
"epoch": 0.24571428571428572,
"grad_norm": 0.20306651294231415,
"kl": 0.00847625732421875,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0003,
"reward": 0.2214849442243576,
"reward_std": 0.23549797013401985,
"rewards/cosine_scaled_reward": -0.06182933505624533,
"rewards/format_reward": 0.9791666716337204,
"step": 215
},
{
"completion_length": 1808.1458740234375,
"epoch": 0.24685714285714286,
"grad_norm": 0.18333780765533447,
"kl": 0.00699615478515625,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0003,
"reward": 0.1494435027707368,
"reward_std": 0.15389228984713554,
"rewards/cosine_scaled_reward": -0.15077029541134834,
"rewards/format_reward": 0.875,
"step": 216
},
{
"completion_length": 1527.5833740234375,
"epoch": 0.248,
"grad_norm": 0.23145990073680878,
"kl": 0.0092010498046875,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0004,
"reward": 0.2188200056552887,
"reward_std": 0.15189463831484318,
"rewards/cosine_scaled_reward": -0.015956051647663116,
"rewards/format_reward": 0.8750000149011612,
"step": 217
},
{
"completion_length": 1596.1250305175781,
"epoch": 0.24914285714285714,
"grad_norm": 0.22957631945610046,
"kl": 0.00939178466796875,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0004,
"reward": 0.2616739912191406,
"reward_std": 0.1865678783506155,
"rewards/cosine_scaled_reward": 0.09852963499724865,
"rewards/format_reward": 0.8125000149011612,
"step": 218
},
{
"completion_length": 1126.3541870117188,
"epoch": 0.2502857142857143,
"grad_norm": 0.24680417776107788,
"kl": 0.0069427490234375,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0003,
"reward": 0.21046569012105465,
"reward_std": 0.19767357036471367,
"rewards/cosine_scaled_reward": -0.08539294765796512,
"rewards/format_reward": 0.9791666716337204,
"step": 219
},
{
"completion_length": 1128.4583740234375,
"epoch": 0.25142857142857145,
"grad_norm": 0.2578405439853668,
"kl": 0.009613037109375,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0004,
"reward": 0.23143617436289787,
"reward_std": 0.1721612773835659,
"rewards/cosine_scaled_reward": -0.04716856777667999,
"rewards/format_reward": 0.9791666716337204,
"step": 220
},
{
"completion_length": 1951.7709045410156,
"epoch": 0.25257142857142856,
"grad_norm": 0.33379021286964417,
"kl": 0.0142974853515625,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0006,
"reward": 0.13845073012635112,
"reward_std": 0.14940405637025833,
"rewards/cosine_scaled_reward": -0.045577242970466614,
"rewards/format_reward": 0.6250000223517418,
"step": 221
},
{
"completion_length": 2032.8125305175781,
"epoch": 0.2537142857142857,
"grad_norm": 0.20354242622852325,
"kl": 0.009674072265625,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0004,
"reward": 0.041867110412567854,
"reward_std": 0.11502710357308388,
"rewards/cosine_scaled_reward": -0.28463663905858994,
"rewards/format_reward": 0.7291666716337204,
"step": 222
},
{
"completion_length": 1453.1250305175781,
"epoch": 0.25485714285714284,
"grad_norm": 0.22267840802669525,
"kl": 0.0079193115234375,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0003,
"reward": 0.1836735513061285,
"reward_std": 0.16883151605725288,
"rewards/cosine_scaled_reward": -0.10032966919243336,
"rewards/format_reward": 0.9166666716337204,
"step": 223
},
{
"completion_length": 1780.9583740234375,
"epoch": 0.256,
"grad_norm": 0.2752217948436737,
"kl": 0.00952911376953125,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0004,
"reward": 0.21352362632751465,
"reward_std": 0.1610415056347847,
"rewards/cosine_scaled_reward": -0.00915946438908577,
"rewards/format_reward": 0.833333358168602,
"step": 224
},
{
"completion_length": 1222.2708435058594,
"epoch": 0.2571428571428571,
"grad_norm": 0.21294601261615753,
"kl": 0.007015228271484375,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0003,
"reward": 0.3245595395565033,
"reward_std": 0.10259661450982094,
"rewards/cosine_scaled_reward": 0.1599423922598362,
"rewards/format_reward": 0.9375,
"step": 225
},
{
"completion_length": 1372.7708587646484,
"epoch": 0.2582857142857143,
"grad_norm": 0.3051982522010803,
"kl": 0.012664794921875,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0005,
"reward": 0.09605667972937226,
"reward_std": 0.11557916086167097,
"rewards/cosine_scaled_reward": -0.24685227498412132,
"rewards/format_reward": 0.8541666865348816,
"step": 226
},
{
"completion_length": 1197.4375305175781,
"epoch": 0.25942857142857145,
"grad_norm": 0.27726492285728455,
"kl": 0.0123291015625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0005,
"reward": 0.21336308866739273,
"reward_std": 0.1688038632273674,
"rewards/cosine_scaled_reward": -0.05070815235376358,
"rewards/format_reward": 0.9166666716337204,
"step": 227
},
{
"completion_length": 1539.5208740234375,
"epoch": 0.26057142857142856,
"grad_norm": 0.28033754229545593,
"kl": 0.01001739501953125,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0004,
"reward": 0.1344006136059761,
"reward_std": 0.18115575425326824,
"rewards/cosine_scaled_reward": -0.14776835404336452,
"rewards/format_reward": 0.8125,
"step": 228
},
{
"completion_length": 1288.6250457763672,
"epoch": 0.26171428571428573,
"grad_norm": 0.3743990361690521,
"kl": 0.0155181884765625,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0006,
"reward": 0.15809894306585193,
"reward_std": 0.15935586765408516,
"rewards/cosine_scaled_reward": -0.12152018398046494,
"rewards/format_reward": 0.8541666865348816,
"step": 229
},
{
"completion_length": 1595.8750305175781,
"epoch": 0.26285714285714284,
"grad_norm": 0.32262828946113586,
"kl": 0.01462554931640625,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0006,
"reward": 0.22118227370083332,
"reward_std": 0.22235484700649977,
"rewards/cosine_scaled_reward": 0.02892589569091797,
"rewards/format_reward": 0.7916666716337204,
"step": 230
},
{
"completion_length": 1655.1250610351562,
"epoch": 0.264,
"grad_norm": 0.2636210322380066,
"kl": 0.01177978515625,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0005,
"reward": 0.2390340268611908,
"reward_std": 0.22538743168115616,
"rewards/cosine_scaled_reward": 0.025473197922110558,
"rewards/format_reward": 0.8750000298023224,
"step": 231
},
{
"completion_length": 1273.1250305175781,
"epoch": 0.2651428571428571,
"grad_norm": 0.20838648080825806,
"kl": 0.01004791259765625,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0004,
"reward": 0.2724638655781746,
"reward_std": 0.24231833592057228,
"rewards/cosine_scaled_reward": 0.0865764303598553,
"rewards/format_reward": 0.8750000149011612,
"step": 232
},
{
"completion_length": 1766.0416870117188,
"epoch": 0.2662857142857143,
"grad_norm": 0.22196227312088013,
"kl": 0.0135498046875,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0005,
"reward": 0.2252907119691372,
"reward_std": 0.18449927121400833,
"rewards/cosine_scaled_reward": 0.013371981680393219,
"rewards/format_reward": 0.8541666865348816,
"step": 233
},
{
"completion_length": 1795.4375610351562,
"epoch": 0.2674285714285714,
"grad_norm": 0.22926455736160278,
"kl": 0.013641357421875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0005,
"reward": 0.18019726127386093,
"reward_std": 0.1873026303946972,
"rewards/cosine_scaled_reward": -0.057943904772400856,
"rewards/format_reward": 0.8125000149011612,
"step": 234
},
{
"completion_length": 1416.2500610351562,
"epoch": 0.26857142857142857,
"grad_norm": 0.4748923182487488,
"kl": 0.01238250732421875,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0005,
"reward": 0.2144976705312729,
"reward_std": 0.1543455570936203,
"rewards/cosine_scaled_reward": -0.017965801060199738,
"rewards/format_reward": 0.8541666716337204,
"step": 235
},
{
"completion_length": 1684.5208740234375,
"epoch": 0.26971428571428574,
"grad_norm": 0.27250581979751587,
"kl": 0.0109100341796875,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0004,
"reward": 0.15142692811787128,
"reward_std": 0.123539624735713,
"rewards/cosine_scaled_reward": -0.15956238843500614,
"rewards/format_reward": 0.8958333432674408,
"step": 236
},
{
"completion_length": 1095.9583435058594,
"epoch": 0.27085714285714285,
"grad_norm": 0.23535336554050446,
"kl": 0.0092010498046875,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0004,
"reward": 0.2338778730481863,
"reward_std": 0.13493703678250313,
"rewards/cosine_scaled_reward": -0.05256740562617779,
"rewards/format_reward": 1.0,
"step": 237
},
{
"completion_length": 1308.5000305175781,
"epoch": 0.272,
"grad_norm": 0.4510970115661621,
"kl": 0.013580322265625,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0005,
"reward": 0.25004918687045574,
"reward_std": 0.20090295001864433,
"rewards/cosine_scaled_reward": 0.025465428829193115,
"rewards/format_reward": 0.8958333432674408,
"step": 238
},
{
"completion_length": 2084.7500610351562,
"epoch": 0.27314285714285713,
"grad_norm": 0.41042184829711914,
"kl": 0.01446533203125,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0006,
"reward": 0.06340811308473349,
"reward_std": 0.18164737150073051,
"rewards/cosine_scaled_reward": -0.22181823663413525,
"rewards/format_reward": 0.6875000223517418,
"step": 239
},
{
"completion_length": 1270.0417175292969,
"epoch": 0.2742857142857143,
"grad_norm": 0.2745667099952698,
"kl": 0.01023101806640625,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0004,
"reward": 0.3920341283082962,
"reward_std": 0.14432355761528015,
"rewards/cosine_scaled_reward": 0.28012172505259514,
"rewards/format_reward": 0.9583333432674408,
"step": 240
},
{
"completion_length": 1518.3958740234375,
"epoch": 0.2754285714285714,
"grad_norm": 0.4780232310295105,
"kl": 0.018524169921875,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0007,
"reward": 0.21057775150984526,
"reward_std": 0.19549552723765373,
"rewards/cosine_scaled_reward": -0.04673420591279864,
"rewards/format_reward": 0.8958333432674408,
"step": 241
},
{
"completion_length": 1732.7500610351562,
"epoch": 0.2765714285714286,
"grad_norm": 0.47158530354499817,
"kl": 0.0208740234375,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0008,
"reward": 0.1667325645685196,
"reward_std": 0.24624676629900932,
"rewards/cosine_scaled_reward": -0.09344856068491936,
"rewards/format_reward": 0.8125000149011612,
"step": 242
},
{
"completion_length": 1035.7083587646484,
"epoch": 0.2777142857142857,
"grad_norm": 0.22557544708251953,
"kl": 0.00783538818359375,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0003,
"reward": 0.22993716970086098,
"reward_std": 0.08992603048682213,
"rewards/cosine_scaled_reward": -0.05973348394036293,
"rewards/format_reward": 1.0,
"step": 243
},
{
"completion_length": 1274.2708740234375,
"epoch": 0.27885714285714286,
"grad_norm": 0.24874719977378845,
"kl": 0.00920867919921875,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0004,
"reward": 0.2895762138068676,
"reward_std": 0.237772386521101,
"rewards/cosine_scaled_reward": 0.10733084753155708,
"rewards/format_reward": 0.895833358168602,
"step": 244
},
{
"completion_length": 1134.4167175292969,
"epoch": 0.28,
"grad_norm": 0.29308614134788513,
"kl": 0.00977325439453125,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0004,
"reward": 0.2019298244267702,
"reward_std": 0.14933411590754986,
"rewards/cosine_scaled_reward": -0.09132302179932594,
"rewards/format_reward": 0.9583333432674408,
"step": 245
},
{
"completion_length": 1158.8542175292969,
"epoch": 0.28114285714285714,
"grad_norm": 0.3501374423503876,
"kl": 0.012237548828125,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0005,
"reward": 0.25629024021327496,
"reward_std": 0.132151298224926,
"rewards/cosine_scaled_reward": 0.014732152223587036,
"rewards/format_reward": 0.9583333432674408,
"step": 246
},
{
"completion_length": 1158.7292022705078,
"epoch": 0.2822857142857143,
"grad_norm": 0.3884400427341461,
"kl": 0.0135498046875,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0005,
"reward": 0.3844507783651352,
"reward_std": 0.1857384592294693,
"rewards/cosine_scaled_reward": 0.250643078237772,
"rewards/format_reward": 0.9583333432674408,
"step": 247
},
{
"completion_length": 1636.3750610351562,
"epoch": 0.2834285714285714,
"grad_norm": 0.4415358304977417,
"kl": 0.01995849609375,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0008,
"reward": 0.15714343590661883,
"reward_std": 0.15885592438280582,
"rewards/cosine_scaled_reward": -0.10814489889889956,
"rewards/format_reward": 0.8125,
"step": 248
},
{
"completion_length": 1972.7084045410156,
"epoch": 0.2845714285714286,
"grad_norm": 0.35937413573265076,
"kl": 0.0176849365234375,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0007,
"reward": 0.16410245560109615,
"reward_std": 0.23904583044350147,
"rewards/cosine_scaled_reward": -0.05793091654777527,
"rewards/format_reward": 0.7500000111758709,
"step": 249
},
{
"completion_length": 1539.4375457763672,
"epoch": 0.2857142857142857,
"grad_norm": 0.26806238293647766,
"kl": 0.02465057373046875,
"learning_rate": 6.281416799501187e-07,
"loss": 0.001,
"reward": 0.25902947783470154,
"reward_std": 0.24061259999871254,
"rewards/cosine_scaled_reward": 0.08197902701795101,
"rewards/format_reward": 0.833333358168602,
"step": 250
},
{
"completion_length": 2087.291717529297,
"epoch": 0.28685714285714287,
"grad_norm": 0.42723548412323,
"kl": 0.031829833984375,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0013,
"reward": 0.0892067551612854,
"reward_std": 0.2596677578985691,
"rewards/cosine_scaled_reward": -0.06505941599607468,
"rewards/format_reward": 0.479166679084301,
"step": 251
},
{
"completion_length": 1393.5000305175781,
"epoch": 0.288,
"grad_norm": 0.4383827745914459,
"kl": 0.01575469970703125,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0006,
"reward": 0.27515115961432457,
"reward_std": 0.14459671452641487,
"rewards/cosine_scaled_reward": 0.06821848452091217,
"rewards/format_reward": 0.9166666716337204,
"step": 252
},
{
"completion_length": 1605.9792175292969,
"epoch": 0.28914285714285715,
"grad_norm": 0.8857656121253967,
"kl": 0.0370941162109375,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0015,
"reward": 0.17214620485901833,
"reward_std": 0.18065106682479382,
"rewards/cosine_scaled_reward": -0.007656781002879143,
"rewards/format_reward": 0.666666679084301,
"step": 253
},
{
"completion_length": 1118.2708740234375,
"epoch": 0.29028571428571426,
"grad_norm": 0.39530983567237854,
"kl": 0.018463134765625,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0007,
"reward": 0.30500828090589494,
"reward_std": 0.14383957721292973,
"rewards/cosine_scaled_reward": 0.20514516159892082,
"rewards/format_reward": 0.7500000149011612,
"step": 254
},
{
"completion_length": 1547.8958587646484,
"epoch": 0.2914285714285714,
"grad_norm": 0.40311965346336365,
"kl": 0.0308685302734375,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0012,
"reward": 0.232159405015409,
"reward_std": 0.16861450299620628,
"rewards/cosine_scaled_reward": 0.06933547928929329,
"rewards/format_reward": 0.7500000149011612,
"step": 255
},
{
"completion_length": 1279.7083435058594,
"epoch": 0.2925714285714286,
"grad_norm": 1.5129071474075317,
"kl": 0.02239990234375,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0009,
"reward": 0.226328669115901,
"reward_std": 0.173635708168149,
"rewards/cosine_scaled_reward": -0.02187468856573105,
"rewards/format_reward": 0.9166666716337204,
"step": 256
},
{
"completion_length": 1523.9375305175781,
"epoch": 0.2937142857142857,
"grad_norm": 0.38841739296913147,
"kl": 0.0191497802734375,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0008,
"reward": 0.14000426977872849,
"reward_std": 0.14408636838197708,
"rewards/cosine_scaled_reward": -0.1849204022437334,
"rewards/format_reward": 0.9166666865348816,
"step": 257
},
{
"completion_length": 1983.7084045410156,
"epoch": 0.2948571428571429,
"grad_norm": 0.3051685690879822,
"kl": 0.029571533203125,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0012,
"reward": 0.22874920442700386,
"reward_std": 0.22693637385964394,
"rewards/cosine_scaled_reward": 0.04347135126590729,
"rewards/format_reward": 0.7916666865348816,
"step": 258
},
{
"completion_length": 1227.4791870117188,
"epoch": 0.296,
"grad_norm": 0.3898649215698242,
"kl": 0.02254486083984375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0009,
"reward": 0.19578362628817558,
"reward_std": 0.17094986885786057,
"rewards/cosine_scaled_reward": -0.07543798349797726,
"rewards/format_reward": 0.8958333432674408,
"step": 259
},
{
"completion_length": 2015.0208740234375,
"epoch": 0.29714285714285715,
"grad_norm": 0.3966330885887146,
"kl": 0.04931640625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.002,
"reward": 0.059827481396496296,
"reward_std": 0.1531398855149746,
"rewards/cosine_scaled_reward": -0.17863771319389343,
"rewards/format_reward": 0.583333358168602,
"step": 260
},
{
"completion_length": 1767.354248046875,
"epoch": 0.29828571428571427,
"grad_norm": 0.4153224527835846,
"kl": 0.0479583740234375,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0019,
"reward": 0.11496858485043049,
"reward_std": 0.1553056426346302,
"rewards/cosine_scaled_reward": -0.18099842593073845,
"rewards/format_reward": 0.8125,
"step": 261
},
{
"completion_length": 1918.3958740234375,
"epoch": 0.29942857142857143,
"grad_norm": 0.7899470925331116,
"kl": 0.0538177490234375,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0022,
"reward": 0.0993692995980382,
"reward_std": 0.13485800474882126,
"rewards/cosine_scaled_reward": -0.17313212295994163,
"rewards/format_reward": 0.7291666716337204,
"step": 262
},
{
"completion_length": 1160.1250457763672,
"epoch": 0.30057142857142854,
"grad_norm": 0.463293194770813,
"kl": 0.032501220703125,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0013,
"reward": 0.1619243435561657,
"reward_std": 0.18417713977396488,
"rewards/cosine_scaled_reward": -0.16348575986921787,
"rewards/format_reward": 0.9375000149011612,
"step": 263
},
{
"completion_length": 1765.7083740234375,
"epoch": 0.3017142857142857,
"grad_norm": 0.4353474974632263,
"kl": 0.05523681640625,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0022,
"reward": 0.22621536999940872,
"reward_std": 0.20676996186375618,
"rewards/cosine_scaled_reward": -0.006626792252063751,
"rewards/format_reward": 0.8750000149011612,
"step": 264
},
{
"completion_length": 2156.8750915527344,
"epoch": 0.3028571428571429,
"grad_norm": 0.4260510206222534,
"kl": 0.06158447265625,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0025,
"reward": 0.1871240846812725,
"reward_std": 0.14801884070038795,
"rewards/cosine_scaled_reward": 0.006121315062046051,
"rewards/format_reward": 0.7083333395421505,
"step": 265
},
{
"completion_length": 2359.229248046875,
"epoch": 0.304,
"grad_norm": 0.6653827428817749,
"kl": 0.1218414306640625,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0049,
"reward": 0.05368301854468882,
"reward_std": 0.1643918640911579,
"rewards/cosine_scaled_reward": -0.15773088112473488,
"rewards/format_reward": 0.5208333544433117,
"step": 266
},
{
"completion_length": 2027.0833740234375,
"epoch": 0.30514285714285716,
"grad_norm": 0.3705967962741852,
"kl": 0.06707763671875,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0027,
"reward": 0.18167724087834358,
"reward_std": 0.1626145876944065,
"rewards/cosine_scaled_reward": -0.0044740717858076096,
"rewards/format_reward": 0.708333358168602,
"step": 267
},
{
"completion_length": 1997.5209045410156,
"epoch": 0.3062857142857143,
"grad_norm": 0.6208778619766235,
"kl": 0.051788330078125,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0021,
"reward": 0.13248581159859896,
"reward_std": 0.18944942951202393,
"rewards/cosine_scaled_reward": -0.1127938311547041,
"rewards/format_reward": 0.7291666716337204,
"step": 268
},
{
"completion_length": 1530.0208740234375,
"epoch": 0.30742857142857144,
"grad_norm": 0.7959035038948059,
"kl": 0.055450439453125,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0022,
"reward": 0.16591855697333813,
"reward_std": 0.16789162531495094,
"rewards/cosine_scaled_reward": -0.0885553527623415,
"rewards/format_reward": 0.8125000298023224,
"step": 269
},
{
"completion_length": 985.6042022705078,
"epoch": 0.30857142857142855,
"grad_norm": 0.32174134254455566,
"kl": 0.01244354248046875,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0005,
"reward": 0.22263910062611103,
"reward_std": 0.1655977163463831,
"rewards/cosine_scaled_reward": -0.062186723574995995,
"rewards/format_reward": 0.9791666716337204,
"step": 270
},
{
"completion_length": 1733.0834045410156,
"epoch": 0.3097142857142857,
"grad_norm": 0.4459218680858612,
"kl": 0.060455322265625,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0024,
"reward": 0.204132997430861,
"reward_std": 0.1648325566202402,
"rewards/cosine_scaled_reward": -0.0029181139543652534,
"rewards/format_reward": 0.7916666716337204,
"step": 271
},
{
"completion_length": 2305.7709045410156,
"epoch": 0.31085714285714283,
"grad_norm": 1.647191047668457,
"kl": 0.1053466796875,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0042,
"reward": 0.09154188225511461,
"reward_std": 0.14747749641537666,
"rewards/cosine_scaled_reward": -0.12416816502809525,
"rewards/format_reward": 0.604166679084301,
"step": 272
},
{
"completion_length": 2009.3333740234375,
"epoch": 0.312,
"grad_norm": 0.4497400224208832,
"kl": 0.12188720703125,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0049,
"reward": 0.09924113377928734,
"reward_std": 0.1725939903408289,
"rewards/cosine_scaled_reward": -0.1677750125527382,
"rewards/format_reward": 0.7083333432674408,
"step": 273
},
{
"completion_length": 1482.4167175292969,
"epoch": 0.31314285714285717,
"grad_norm": 0.45461875200271606,
"kl": 0.025146484375,
"learning_rate": 5.531415671340826e-07,
"loss": 0.001,
"reward": 0.1875467412173748,
"reward_std": 0.15722386725246906,
"rewards/cosine_scaled_reward": -0.08861712459474802,
"rewards/format_reward": 0.8958333432674408,
"step": 274
},
{
"completion_length": 2088.229248046875,
"epoch": 0.3142857142857143,
"grad_norm": 0.8553512692451477,
"kl": 0.09869384765625,
"learning_rate": 5.5e-07,
"loss": 0.0039,
"reward": 0.08563580922782421,
"reward_std": 0.21715955808758736,
"rewards/cosine_scaled_reward": -0.15975168626755476,
"rewards/format_reward": 0.6458333507180214,
"step": 275
},
{
"completion_length": 1473.0417022705078,
"epoch": 0.31542857142857145,
"grad_norm": 0.6246925592422485,
"kl": 0.06494140625,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0026,
"reward": 0.2704322747886181,
"reward_std": 0.17775586992502213,
"rewards/cosine_scaled_reward": 0.07806644402444363,
"rewards/format_reward": 0.8750000149011612,
"step": 276
},
{
"completion_length": 1408.9583740234375,
"epoch": 0.31657142857142856,
"grad_norm": 0.7783192992210388,
"kl": 0.05496978759765625,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0022,
"reward": 0.20983506552875042,
"reward_std": 0.19946245104074478,
"rewards/cosine_scaled_reward": -0.04521218314766884,
"rewards/format_reward": 0.8958333432674408,
"step": 277
},
{
"completion_length": 1812.8959045410156,
"epoch": 0.3177142857142857,
"grad_norm": 0.9162847995758057,
"kl": 0.1117401123046875,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0045,
"reward": 0.1658716592937708,
"reward_std": 0.20760459825396538,
"rewards/cosine_scaled_reward": -0.04410050390288234,
"rewards/format_reward": 0.729166679084301,
"step": 278
},
{
"completion_length": 1827.1666870117188,
"epoch": 0.31885714285714284,
"grad_norm": 1.2598878145217896,
"kl": 0.12068939208984375,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0048,
"reward": 0.1136073712259531,
"reward_std": 0.2031346820294857,
"rewards/cosine_scaled_reward": -0.1427288819104433,
"rewards/format_reward": 0.7291666865348816,
"step": 279
},
{
"completion_length": 1207.6041717529297,
"epoch": 0.32,
"grad_norm": 0.3006845712661743,
"kl": 0.0462493896484375,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0018,
"reward": 0.25188567116856575,
"reward_std": 0.16300074756145477,
"rewards/cosine_scaled_reward": -0.016072510741651058,
"rewards/format_reward": 1.0,
"step": 280
},
{
"completion_length": 1508.6667175292969,
"epoch": 0.3211428571428571,
"grad_norm": 1.2887221574783325,
"kl": 0.072967529296875,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0029,
"reward": 0.19021394243463874,
"reward_std": 0.19989290460944176,
"rewards/cosine_scaled_reward": -0.0689934715628624,
"rewards/format_reward": 0.8750000149011612,
"step": 281
},
{
"completion_length": 1973.854248046875,
"epoch": 0.3222857142857143,
"grad_norm": 0.9689493775367737,
"kl": 0.158477783203125,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0063,
"reward": 0.13670184463262558,
"reward_std": 0.21750157698988914,
"rewards/cosine_scaled_reward": -0.11265072226524353,
"rewards/format_reward": 0.7500000149011612,
"step": 282
},
{
"completion_length": 1349.2083740234375,
"epoch": 0.32342857142857145,
"grad_norm": 1.58186936378479,
"kl": 0.15234375,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0061,
"reward": 0.15773484483361244,
"reward_std": 0.15496913716197014,
"rewards/cosine_scaled_reward": -0.15540640894323587,
"rewards/format_reward": 0.9166666716337204,
"step": 283
},
{
"completion_length": 1698.4791870117188,
"epoch": 0.32457142857142857,
"grad_norm": 1.2330901622772217,
"kl": 0.1558837890625,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0062,
"reward": 0.14318925887346268,
"reward_std": 0.2059284672141075,
"rewards/cosine_scaled_reward": -0.12442192807793617,
"rewards/format_reward": 0.7916666865348816,
"step": 284
},
{
"completion_length": 1879.6459045410156,
"epoch": 0.32571428571428573,
"grad_norm": 1.837733507156372,
"kl": 0.2252197265625,
"learning_rate": 5.186095868151436e-07,
"loss": 0.009,
"reward": 0.09095461945980787,
"reward_std": 0.16584222950041294,
"rewards/cosine_scaled_reward": -0.18948345258831978,
"rewards/format_reward": 0.7291666865348816,
"step": 285
},
{
"completion_length": 1449.5000305175781,
"epoch": 0.32685714285714285,
"grad_norm": 0.5687366724014282,
"kl": 0.106109619140625,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0042,
"reward": 0.24451042897999287,
"reward_std": 0.19233474135398865,
"rewards/cosine_scaled_reward": -0.009479179978370667,
"rewards/format_reward": 0.9583333432674408,
"step": 286
},
{
"completion_length": 1656.9583740234375,
"epoch": 0.328,
"grad_norm": 1.2167413234710693,
"kl": 0.271484375,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0109,
"reward": 0.12593204155564308,
"reward_std": 0.16647349670529366,
"rewards/cosine_scaled_reward": -0.14858145266771317,
"rewards/format_reward": 0.7708333432674408,
"step": 287
},
{
"completion_length": 1323.0000305175781,
"epoch": 0.3291428571428571,
"grad_norm": 0.9303487539291382,
"kl": 0.2335205078125,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0093,
"reward": 0.23953218385577202,
"reward_std": 0.1995762512087822,
"rewards/cosine_scaled_reward": 0.029527440055971965,
"rewards/format_reward": 0.8541666716337204,
"step": 288
},
{
"completion_length": 1701.0000610351562,
"epoch": 0.3302857142857143,
"grad_norm": 2.4383161067962646,
"kl": 0.27117919921875,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0109,
"reward": 0.15982208959758282,
"reward_std": 0.15560205932706594,
"rewards/cosine_scaled_reward": -0.07527756690979004,
"rewards/format_reward": 0.7500000298023224,
"step": 289
},
{
"completion_length": 1244.2292022705078,
"epoch": 0.3314285714285714,
"grad_norm": 1.5626213550567627,
"kl": 0.178863525390625,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0072,
"reward": 0.3573876768350601,
"reward_std": 0.1983262486755848,
"rewards/cosine_scaled_reward": 0.20004013180732727,
"rewards/format_reward": 0.9583333432674408,
"step": 290
},
{
"completion_length": 1349.1250610351562,
"epoch": 0.3325714285714286,
"grad_norm": 1.2207589149475098,
"kl": 0.31201171875,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0125,
"reward": 0.16633182391524315,
"reward_std": 0.15154998749494553,
"rewards/cosine_scaled_reward": -0.1112820515409112,
"rewards/format_reward": 0.8541666716337204,
"step": 291
},
{
"completion_length": 1625.3542022705078,
"epoch": 0.33371428571428574,
"grad_norm": 1.6357200145721436,
"kl": 0.2861785888671875,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0114,
"reward": 0.12437815871089697,
"reward_std": 0.20795376785099506,
"rewards/cosine_scaled_reward": -0.14810878783464432,
"rewards/format_reward": 0.770833358168602,
"step": 292
},
{
"completion_length": 1631.4167175292969,
"epoch": 0.33485714285714285,
"grad_norm": 1.7008156776428223,
"kl": 0.48681640625,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0195,
"reward": 0.12261722923722118,
"reward_std": 0.17399809882044792,
"rewards/cosine_scaled_reward": -0.1478295437991619,
"rewards/format_reward": 0.7708333432674408,
"step": 293
},
{
"completion_length": 1580.5625305175781,
"epoch": 0.336,
"grad_norm": 1.610378623008728,
"kl": 0.1676025390625,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0067,
"reward": 0.28274150006473064,
"reward_std": 0.1986326277256012,
"rewards/cosine_scaled_reward": 0.10465374775230885,
"rewards/format_reward": 0.8750000149011612,
"step": 294
},
{
"completion_length": 2265.6458740234375,
"epoch": 0.33714285714285713,
"grad_norm": 2.0899946689605713,
"kl": 0.60400390625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0241,
"reward": 0.12508661299943924,
"reward_std": 0.15057932399213314,
"rewards/cosine_scaled_reward": -0.04951752349734306,
"rewards/format_reward": 0.5833333432674408,
"step": 295
},
{
"completion_length": 1289.0416870117188,
"epoch": 0.3382857142857143,
"grad_norm": 1.59931218624115,
"kl": 0.325592041015625,
"learning_rate": 4.842626371469149e-07,
"loss": 0.013,
"reward": 0.3012130483984947,
"reward_std": 0.27314068377017975,
"rewards/cosine_scaled_reward": 0.14410861767828465,
"rewards/format_reward": 0.8541666865348816,
"step": 296
},
{
"completion_length": 1420.2708892822266,
"epoch": 0.3394285714285714,
"grad_norm": 2.525320053100586,
"kl": 0.368133544921875,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0147,
"reward": 0.12896058335900307,
"reward_std": 0.15636470913887024,
"rewards/cosine_scaled_reward": -0.15718013513833284,
"rewards/format_reward": 0.8125000149011612,
"step": 297
},
{
"completion_length": 1947.3750305175781,
"epoch": 0.3405714285714286,
"grad_norm": 2.7730467319488525,
"kl": 0.7021484375,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0281,
"reward": 0.20079701766371727,
"reward_std": 0.2073996216058731,
"rewards/cosine_scaled_reward": -0.03276558732613921,
"rewards/format_reward": 0.833333358168602,
"step": 298
},
{
"completion_length": 1455.3958740234375,
"epoch": 0.3417142857142857,
"grad_norm": 2.607783317565918,
"kl": 0.4853515625,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0194,
"reward": 0.12638170272111893,
"reward_std": 0.17803113162517548,
"rewards/cosine_scaled_reward": -0.17176830675452948,
"rewards/format_reward": 0.833333358168602,
"step": 299
},
{
"completion_length": 1603.4166870117188,
"epoch": 0.34285714285714286,
"grad_norm": 3.5268094539642334,
"kl": 0.693359375,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0277,
"reward": 0.2142921146005392,
"reward_std": 0.14737887866795063,
"rewards/cosine_scaled_reward": 0.022697463631629944,
"rewards/format_reward": 0.770833358168602,
"step": 300
},
{
"completion_length": 1849.9791870117188,
"epoch": 0.344,
"grad_norm": 195.25047302246094,
"kl": 6.3701171875,
"learning_rate": 4.68766384637248e-07,
"loss": 0.2556,
"reward": 0.22336112707853317,
"reward_std": 0.2203882373869419,
"rewards/cosine_scaled_reward": 0.050891561433672905,
"rewards/format_reward": 0.7500000149011612,
"step": 301
},
{
"completion_length": 1800.541748046875,
"epoch": 0.34514285714285714,
"grad_norm": 2.098578453063965,
"kl": 0.9892578125,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0396,
"reward": 0.14433493767865002,
"reward_std": 0.15274815633893013,
"rewards/cosine_scaled_reward": -0.090041883289814,
"rewards/format_reward": 0.7291666865348816,
"step": 302
},
{
"completion_length": 1295.1042175292969,
"epoch": 0.3462857142857143,
"grad_norm": 1.0480232238769531,
"kl": 0.491546630859375,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0197,
"reward": 0.20194483920931816,
"reward_std": 0.13899757340550423,
"rewards/cosine_scaled_reward": -0.060742251574993134,
"rewards/format_reward": 0.8958333432674408,
"step": 303
},
{
"completion_length": 1140.8958587646484,
"epoch": 0.3474285714285714,
"grad_norm": 2.1697607040405273,
"kl": 0.461669921875,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0184,
"reward": 0.3258020356297493,
"reward_std": 0.24504756554961205,
"rewards/cosine_scaled_reward": 0.16624495573341846,
"rewards/format_reward": 0.9166666865348816,
"step": 304
},
{
"completion_length": 1822.1458435058594,
"epoch": 0.3485714285714286,
"grad_norm": 1.8082146644592285,
"kl": 0.8291015625,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0332,
"reward": 0.13778295274823904,
"reward_std": 0.19784759543836117,
"rewards/cosine_scaled_reward": -0.10065719857811928,
"rewards/format_reward": 0.7291666865348816,
"step": 305
},
{
"completion_length": 1604.3542175292969,
"epoch": 0.3497142857142857,
"grad_norm": 4.8597798347473145,
"kl": 0.73583984375,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0294,
"reward": 0.194507101085037,
"reward_std": 0.16609442234039307,
"rewards/cosine_scaled_reward": -0.0308070071041584,
"rewards/format_reward": 0.7916666865348816,
"step": 306
},
{
"completion_length": 1429.8750305175781,
"epoch": 0.35085714285714287,
"grad_norm": 1.5004690885543823,
"kl": 0.541259765625,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0216,
"reward": 0.34246205165982246,
"reward_std": 0.23809099197387695,
"rewards/cosine_scaled_reward": 0.23215805366635323,
"rewards/format_reward": 0.8541666716337204,
"step": 307
},
{
"completion_length": 1361.8333587646484,
"epoch": 0.352,
"grad_norm": 2.2864489555358887,
"kl": 0.517822265625,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0208,
"reward": 0.16233128495514393,
"reward_std": 0.1740443892776966,
"rewards/cosine_scaled_reward": -0.05880259908735752,
"rewards/format_reward": 0.7291666716337204,
"step": 308
},
{
"completion_length": 1388.812515258789,
"epoch": 0.35314285714285715,
"grad_norm": 1.1416103839874268,
"kl": 0.419677734375,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0168,
"reward": 0.1640561018139124,
"reward_std": 0.19008341804146767,
"rewards/cosine_scaled_reward": -0.10478888358920813,
"rewards/format_reward": 0.833333358168602,
"step": 309
},
{
"completion_length": 1815.3333892822266,
"epoch": 0.35428571428571426,
"grad_norm": 1.7600479125976562,
"kl": 0.5670166015625,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0227,
"reward": 0.18639850337058306,
"reward_std": 0.14701339416205883,
"rewards/cosine_scaled_reward": -0.038788361474871635,
"rewards/format_reward": 0.7916666865348816,
"step": 310
},
{
"completion_length": 1752.2500305175781,
"epoch": 0.3554285714285714,
"grad_norm": 1.5405431985855103,
"kl": 0.7080078125,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0283,
"reward": 0.20283151790499687,
"reward_std": 0.1671408899128437,
"rewards/cosine_scaled_reward": -0.00958926323801279,
"rewards/format_reward": 0.7916666865348816,
"step": 311
},
{
"completion_length": 1452.8333740234375,
"epoch": 0.3565714285714286,
"grad_norm": 1.9203969240188599,
"kl": 0.5329742431640625,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0214,
"reward": 0.1789357993984595,
"reward_std": 0.1928165927529335,
"rewards/cosine_scaled_reward": -0.06470100209116936,
"rewards/format_reward": 0.8125000149011612,
"step": 312
},
{
"completion_length": 1656.5625610351562,
"epoch": 0.3577142857142857,
"grad_norm": 3.925797462463379,
"kl": 0.5869140625,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0234,
"reward": 0.15515877585858107,
"reward_std": 0.16902573220431805,
"rewards/cosine_scaled_reward": -0.15649997163563967,
"rewards/format_reward": 0.8958333432674408,
"step": 313
},
{
"completion_length": 1387.2291870117188,
"epoch": 0.3588571428571429,
"grad_norm": 2.6238675117492676,
"kl": 0.428863525390625,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0172,
"reward": 0.22301865927875042,
"reward_std": 0.14535253681242466,
"rewards/cosine_scaled_reward": -0.030876588076353073,
"rewards/format_reward": 0.9166666716337204,
"step": 314
},
{
"completion_length": 1324.2083587646484,
"epoch": 0.36,
"grad_norm": 1.4441876411437988,
"kl": 0.6809234619140625,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0273,
"reward": 0.16800063382834196,
"reward_std": 0.16469407826662064,
"rewards/cosine_scaled_reward": -0.09765112772583961,
"rewards/format_reward": 0.8333333432674408,
"step": 315
},
{
"completion_length": 1827.1042175292969,
"epoch": 0.36114285714285715,
"grad_norm": 1.9154859781265259,
"kl": 0.8681640625,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0347,
"reward": 0.0611695135012269,
"reward_std": 0.20097459852695465,
"rewards/cosine_scaled_reward": -0.22885679081082344,
"rewards/format_reward": 0.6875000149011612,
"step": 316
},
{
"completion_length": 1761.1042175292969,
"epoch": 0.36228571428571427,
"grad_norm": 1.2220473289489746,
"kl": 0.858551025390625,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0343,
"reward": 0.2909087585285306,
"reward_std": 0.20921817421913147,
"rewards/cosine_scaled_reward": 0.12270434573292732,
"rewards/format_reward": 0.8750000149011612,
"step": 317
},
{
"completion_length": 1412.3333740234375,
"epoch": 0.36342857142857143,
"grad_norm": 1.2659395933151245,
"kl": 0.451171875,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0181,
"reward": 0.1977246394380927,
"reward_std": 0.16955609619617462,
"rewards/cosine_scaled_reward": -0.0062361303716897964,
"rewards/format_reward": 0.7500000149011612,
"step": 318
},
{
"completion_length": 1949.3750305175781,
"epoch": 0.36457142857142855,
"grad_norm": 3.923489809036255,
"kl": 0.63671875,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0255,
"reward": 0.13074796926230192,
"reward_std": 0.1879247985780239,
"rewards/cosine_scaled_reward": -0.09139842540025711,
"rewards/format_reward": 0.6875000149011612,
"step": 319
},
{
"completion_length": 1168.2292175292969,
"epoch": 0.3657142857142857,
"grad_norm": 1.0396791696548462,
"kl": 0.1934814453125,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0077,
"reward": 0.22300215438008308,
"reward_std": 0.2135285884141922,
"rewards/cosine_scaled_reward": -0.06148634012788534,
"rewards/format_reward": 0.9791666716337204,
"step": 320
},
{
"completion_length": 1763.3333435058594,
"epoch": 0.3668571428571429,
"grad_norm": 1.4253339767456055,
"kl": 0.74658203125,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0299,
"reward": 0.1205808836966753,
"reward_std": 0.17672885209321976,
"rewards/cosine_scaled_reward": -0.11066987551748753,
"rewards/format_reward": 0.6875000149011612,
"step": 321
},
{
"completion_length": 1481.2917175292969,
"epoch": 0.368,
"grad_norm": 1.548012614250183,
"kl": 0.4468994140625,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0179,
"reward": 0.2291913628578186,
"reward_std": 0.14006879553198814,
"rewards/cosine_scaled_reward": 0.01431015320122242,
"rewards/format_reward": 0.8541666865348816,
"step": 322
},
{
"completion_length": 1383.9583740234375,
"epoch": 0.36914285714285716,
"grad_norm": 1.8068033456802368,
"kl": 0.530517578125,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0212,
"reward": 0.27303827553987503,
"reward_std": 0.17342529818415642,
"rewards/cosine_scaled_reward": 0.07235794328153133,
"rewards/format_reward": 0.895833358168602,
"step": 323
},
{
"completion_length": 1631.8333740234375,
"epoch": 0.3702857142857143,
"grad_norm": 3.112732410430908,
"kl": 0.5965576171875,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0238,
"reward": 0.11710180155932903,
"reward_std": 0.14361269772052765,
"rewards/cosine_scaled_reward": -0.21200886741280556,
"rewards/format_reward": 0.8750000149011612,
"step": 324
},
{
"completion_length": 967.8541717529297,
"epoch": 0.37142857142857144,
"grad_norm": 1.7522660493850708,
"kl": 0.22650146484375,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0091,
"reward": 0.2394350841641426,
"reward_std": 0.1827605739235878,
"rewards/cosine_scaled_reward": 0.019153601489961147,
"rewards/format_reward": 0.8750000149011612,
"step": 325
},
{
"completion_length": 1521.2708587646484,
"epoch": 0.37257142857142855,
"grad_norm": 3.303557872772217,
"kl": 0.71337890625,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0285,
"reward": 0.16831985116004944,
"reward_std": 0.16597898304462433,
"rewards/cosine_scaled_reward": -0.10751725360751152,
"rewards/format_reward": 0.8541666716337204,
"step": 326
},
{
"completion_length": 1820.0625610351562,
"epoch": 0.3737142857142857,
"grad_norm": 2.2156002521514893,
"kl": 1.386962890625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0555,
"reward": 0.1753358980640769,
"reward_std": 0.13772335462272167,
"rewards/cosine_scaled_reward": -0.009533978998661041,
"rewards/format_reward": 0.6875,
"step": 327
},
{
"completion_length": 1580.7708740234375,
"epoch": 0.37485714285714283,
"grad_norm": 2.2257449626922607,
"kl": 0.70751953125,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0283,
"reward": 0.20990237966179848,
"reward_std": 0.18772607296705246,
"rewards/cosine_scaled_reward": 0.03089301474392414,
"rewards/format_reward": 0.7500000149011612,
"step": 328
},
{
"completion_length": 1922.5209045410156,
"epoch": 0.376,
"grad_norm": 2.8192689418792725,
"kl": 1.236328125,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0494,
"reward": 0.1401547589339316,
"reward_std": 0.18911270424723625,
"rewards/cosine_scaled_reward": -0.07860468700528145,
"rewards/format_reward": 0.6875000149011612,
"step": 329
},
{
"completion_length": 2099.1875610351562,
"epoch": 0.37714285714285717,
"grad_norm": 2.138005018234253,
"kl": 1.3330078125,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0532,
"reward": 0.13079076260328293,
"reward_std": 0.25324463099241257,
"rewards/cosine_scaled_reward": -0.06187394913285971,
"rewards/format_reward": 0.6250000298023224,
"step": 330
},
{
"completion_length": 1694.0416870117188,
"epoch": 0.3782857142857143,
"grad_norm": 4.427945613861084,
"kl": 1.271484375,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0509,
"reward": 0.06919177388772368,
"reward_std": 0.14648743718862534,
"rewards/cosine_scaled_reward": -0.2538422755897045,
"rewards/format_reward": 0.770833358168602,
"step": 331
},
{
"completion_length": 1520.4166870117188,
"epoch": 0.37942857142857145,
"grad_norm": 2.170893669128418,
"kl": 1.00927734375,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0404,
"reward": 0.2010207176208496,
"reward_std": 0.22100840136408806,
"rewards/cosine_scaled_reward": -0.01634824648499489,
"rewards/format_reward": 0.7916666716337204,
"step": 332
},
{
"completion_length": 1651.2083435058594,
"epoch": 0.38057142857142856,
"grad_norm": 3.6611242294311523,
"kl": 1.2607421875,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0505,
"reward": 0.09817091876175255,
"reward_std": 0.15193179063498974,
"rewards/cosine_scaled_reward": -0.21044551581144333,
"rewards/format_reward": 0.7916666716337204,
"step": 333
},
{
"completion_length": 1986.229248046875,
"epoch": 0.38171428571428573,
"grad_norm": 3.5136094093322754,
"kl": 1.3896484375,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0556,
"reward": 0.17547175474464893,
"reward_std": 0.2079135961830616,
"rewards/cosine_scaled_reward": -0.031742025166749954,
"rewards/format_reward": 0.7291667014360428,
"step": 334
},
{
"completion_length": 1639.3750610351562,
"epoch": 0.38285714285714284,
"grad_norm": 4.113811016082764,
"kl": 1.249267578125,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0501,
"reward": 0.14984197542071342,
"reward_std": 0.16426498722285032,
"rewards/cosine_scaled_reward": -0.14330823719501495,
"rewards/format_reward": 0.8541666865348816,
"step": 335
},
{
"completion_length": 1604.9583740234375,
"epoch": 0.384,
"grad_norm": 2.564976215362549,
"kl": 0.9404296875,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0376,
"reward": 0.08363019116222858,
"reward_std": 0.13052179291844368,
"rewards/cosine_scaled_reward": -0.2588311657309532,
"rewards/format_reward": 0.833333358168602,
"step": 336
},
{
"completion_length": 1238.4583740234375,
"epoch": 0.3851428571428571,
"grad_norm": 2.2139716148376465,
"kl": 0.671295166015625,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0269,
"reward": 0.17029657028615475,
"reward_std": 0.23339306563138962,
"rewards/cosine_scaled_reward": -0.097343516536057,
"rewards/format_reward": 0.833333358168602,
"step": 337
},
{
"completion_length": 2030.3541870117188,
"epoch": 0.3862857142857143,
"grad_norm": 2.1332132816314697,
"kl": 1.28515625,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0514,
"reward": 0.0918416610584245,
"reward_std": 0.19109328091144562,
"rewards/cosine_scaled_reward": -0.16856661438941956,
"rewards/format_reward": 0.6875000074505806,
"step": 338
},
{
"completion_length": 2271.2500610351562,
"epoch": 0.38742857142857146,
"grad_norm": 1.892738938331604,
"kl": 1.158203125,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0462,
"reward": 0.08900500182062387,
"reward_std": 0.21380429714918137,
"rewards/cosine_scaled_reward": -0.12333061918616295,
"rewards/format_reward": 0.5833333656191826,
"step": 339
},
{
"completion_length": 1519.2084045410156,
"epoch": 0.38857142857142857,
"grad_norm": 2.038071870803833,
"kl": 0.5703125,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0228,
"reward": 0.24512270092964172,
"reward_std": 0.18805953487753868,
"rewards/cosine_scaled_reward": 0.04649870842695236,
"rewards/format_reward": 0.8541666865348816,
"step": 340
},
{
"completion_length": 1584.0417175292969,
"epoch": 0.38971428571428574,
"grad_norm": 2.7954156398773193,
"kl": 0.666015625,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0267,
"reward": 0.2796506742015481,
"reward_std": 0.18307143822312355,
"rewards/cosine_scaled_reward": 0.09590147994458675,
"rewards/format_reward": 0.8541666865348816,
"step": 341
},
{
"completion_length": 1688.1042175292969,
"epoch": 0.39085714285714285,
"grad_norm": 2.481614589691162,
"kl": 0.646728515625,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0259,
"reward": 0.13232821132987738,
"reward_std": 0.15718812122941017,
"rewards/cosine_scaled_reward": -0.10951092094182968,
"rewards/format_reward": 0.7291666865348816,
"step": 342
},
{
"completion_length": 2052.0625915527344,
"epoch": 0.392,
"grad_norm": 1.4259511232376099,
"kl": 1.07421875,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0429,
"reward": 0.1814631875604391,
"reward_std": 0.22088013961911201,
"rewards/cosine_scaled_reward": -0.04379495978355408,
"rewards/format_reward": 0.770833358168602,
"step": 343
},
{
"completion_length": 1663.125,
"epoch": 0.3931428571428571,
"grad_norm": 1.6074610948562622,
"kl": 0.69775390625,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0279,
"reward": 0.21839727461338043,
"reward_std": 0.15816613100469112,
"rewards/cosine_scaled_reward": 0.029275711625814438,
"rewards/format_reward": 0.770833358168602,
"step": 344
},
{
"completion_length": 1546.8333587646484,
"epoch": 0.3942857142857143,
"grad_norm": 1.4356237649917603,
"kl": 0.76837158203125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0308,
"reward": 0.2311046477407217,
"reward_std": 0.16006076335906982,
"rewards/cosine_scaled_reward": 0.007090110331773758,
"rewards/format_reward": 0.8750000298023224,
"step": 345
},
{
"completion_length": 2192.6459350585938,
"epoch": 0.3954285714285714,
"grad_norm": 3.3563661575317383,
"kl": 1.0556640625,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0422,
"reward": 0.08254441898316145,
"reward_std": 0.21427064761519432,
"rewards/cosine_scaled_reward": -0.1869775615632534,
"rewards/format_reward": 0.6875000149011612,
"step": 346
},
{
"completion_length": 1711.5834350585938,
"epoch": 0.3965714285714286,
"grad_norm": 2.4092659950256348,
"kl": 0.818359375,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0328,
"reward": 0.3048556298017502,
"reward_std": 0.24222856387495995,
"rewards/cosine_scaled_reward": 0.14202131098136306,
"rewards/format_reward": 0.8750000149011612,
"step": 347
},
{
"completion_length": 1510.9583435058594,
"epoch": 0.3977142857142857,
"grad_norm": 2.6288950443267822,
"kl": 0.750732421875,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.03,
"reward": 0.18389190919697285,
"reward_std": 0.22533446922898293,
"rewards/cosine_scaled_reward": -0.06304232217371464,
"rewards/format_reward": 0.8333333432674408,
"step": 348
},
{
"completion_length": 1343.7083587646484,
"epoch": 0.39885714285714285,
"grad_norm": 1.5651475191116333,
"kl": 0.58001708984375,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0232,
"reward": 0.3366018421947956,
"reward_std": 0.2321232110261917,
"rewards/cosine_scaled_reward": 0.21495839580893517,
"rewards/format_reward": 0.8541666865348816,
"step": 349
},
{
"completion_length": 1627.8750610351562,
"epoch": 0.4,
"grad_norm": 1.5066512823104858,
"kl": 0.85693359375,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0343,
"reward": 0.27388138696551323,
"reward_std": 0.2236754074692726,
"rewards/cosine_scaled_reward": 0.14884734898805618,
"rewards/format_reward": 0.7500000149011612,
"step": 350
},
{
"completion_length": 1225.2916870117188,
"epoch": 0.40114285714285713,
"grad_norm": 1.9423860311508179,
"kl": 0.312164306640625,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0124,
"reward": 0.1787915969034657,
"reward_std": 0.138715498149395,
"rewards/cosine_scaled_reward": -0.0604798283893615,
"rewards/format_reward": 0.7916666865348816,
"step": 351
},
{
"completion_length": 1779.354248046875,
"epoch": 0.4022857142857143,
"grad_norm": 2.395528793334961,
"kl": 1.109375,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0444,
"reward": 0.11777728889137506,
"reward_std": 0.20832915045320988,
"rewards/cosine_scaled_reward": -0.16191274672746658,
"rewards/format_reward": 0.770833358168602,
"step": 352
},
{
"completion_length": 1802.1875305175781,
"epoch": 0.4034285714285714,
"grad_norm": 3.2702293395996094,
"kl": 0.888671875,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0355,
"reward": 0.14973015896975994,
"reward_std": 0.2422914244234562,
"rewards/cosine_scaled_reward": -0.08677250519394875,
"rewards/format_reward": 0.7500000149011612,
"step": 353
},
{
"completion_length": 1341.9166870117188,
"epoch": 0.4045714285714286,
"grad_norm": 2.3067736625671387,
"kl": 0.63531494140625,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0254,
"reward": 0.08647960424423218,
"reward_std": 0.14325924962759018,
"rewards/cosine_scaled_reward": -0.2389773204922676,
"rewards/format_reward": 0.8125000149011612,
"step": 354
},
{
"completion_length": 1209.0000305175781,
"epoch": 0.4057142857142857,
"grad_norm": 2.9223105907440186,
"kl": 0.60986328125,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0244,
"reward": 0.21037685312330723,
"reward_std": 0.15928080305457115,
"rewards/cosine_scaled_reward": -0.018377395812422037,
"rewards/format_reward": 0.833333358168602,
"step": 355
},
{
"completion_length": 1625.2292175292969,
"epoch": 0.40685714285714286,
"grad_norm": 2.497197151184082,
"kl": 1.08203125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0433,
"reward": 0.15635429695248604,
"reward_std": 0.18079080618917942,
"rewards/cosine_scaled_reward": -0.09902848303318024,
"rewards/format_reward": 0.7916666865348816,
"step": 356
},
{
"completion_length": 1548.1250457763672,
"epoch": 0.408,
"grad_norm": 2.1372578144073486,
"kl": 1.04052734375,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0416,
"reward": 0.1531627606600523,
"reward_std": 0.13187414780259132,
"rewards/cosine_scaled_reward": -0.15889397263526917,
"rewards/format_reward": 0.8958333432674408,
"step": 357
},
{
"completion_length": 1938.7500915527344,
"epoch": 0.40914285714285714,
"grad_norm": 3.2893924713134766,
"kl": 1.296875,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.052,
"reward": 0.23603218793869019,
"reward_std": 0.2104952149093151,
"rewards/cosine_scaled_reward": 0.06818200647830963,
"rewards/format_reward": 0.7708333432674408,
"step": 358
},
{
"completion_length": 1526.7292175292969,
"epoch": 0.4102857142857143,
"grad_norm": 1.6249175071716309,
"kl": 0.94677734375,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.038,
"reward": 0.14598742313683033,
"reward_std": 0.16005707904696465,
"rewards/cosine_scaled_reward": -0.08985930308699608,
"rewards/format_reward": 0.7291666865348816,
"step": 359
},
{
"completion_length": 1373.6459045410156,
"epoch": 0.4114285714285714,
"grad_norm": 2.30932354927063,
"kl": 1.03515625,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0414,
"reward": 0.20623124949634075,
"reward_std": 0.24741016328334808,
"rewards/cosine_scaled_reward": -0.009535698220133781,
"rewards/format_reward": 0.8125000149011612,
"step": 360
},
{
"completion_length": 1704.8750457763672,
"epoch": 0.4125714285714286,
"grad_norm": 1.839920163154602,
"kl": 1.0859375,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0434,
"reward": 0.12991097196936607,
"reward_std": 0.16597579792141914,
"rewards/cosine_scaled_reward": -0.13096854276955128,
"rewards/format_reward": 0.7500000149011612,
"step": 361
},
{
"completion_length": 1567.2083740234375,
"epoch": 0.4137142857142857,
"grad_norm": 1.8752156496047974,
"kl": 1.0390625,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0416,
"reward": 0.1774279922246933,
"reward_std": 0.20492257550358772,
"rewards/cosine_scaled_reward": -0.10070006363093853,
"rewards/format_reward": 0.8750000149011612,
"step": 362
},
{
"completion_length": 1532.4584350585938,
"epoch": 0.41485714285714287,
"grad_norm": 2.349215507507324,
"kl": 1.103759765625,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0442,
"reward": 0.12260803673416376,
"reward_std": 0.21350538730621338,
"rewards/cosine_scaled_reward": -0.17045626137405634,
"rewards/format_reward": 0.8125000149011612,
"step": 363
},
{
"completion_length": 1452.5417175292969,
"epoch": 0.416,
"grad_norm": 1.401031255722046,
"kl": 1.0751953125,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.043,
"reward": 0.16770456731319427,
"reward_std": 0.1539650922641158,
"rewards/cosine_scaled_reward": -0.10999439284205437,
"rewards/format_reward": 0.8541666865348816,
"step": 364
},
{
"completion_length": 2189.7084350585938,
"epoch": 0.41714285714285715,
"grad_norm": 3.1197776794433594,
"kl": 1.63671875,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0654,
"reward": 0.011583839543163776,
"reward_std": 0.16744238138198853,
"rewards/cosine_scaled_reward": -0.26333725824952126,
"rewards/format_reward": 0.5625,
"step": 365
},
{
"completion_length": 1772.6666870117188,
"epoch": 0.41828571428571426,
"grad_norm": 1.569739818572998,
"kl": 0.8743896484375,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0349,
"reward": 0.18236286379396915,
"reward_std": 0.204185388982296,
"rewards/cosine_scaled_reward": -0.01714538410305977,
"rewards/format_reward": 0.7291667014360428,
"step": 366
},
{
"completion_length": 1263.0833740234375,
"epoch": 0.41942857142857143,
"grad_norm": 1.6131809949874878,
"kl": 0.82684326171875,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0331,
"reward": 0.24072659714147449,
"reward_std": 0.16533087193965912,
"rewards/cosine_scaled_reward": 0.038546825759112835,
"rewards/format_reward": 0.8541666865348816,
"step": 367
},
{
"completion_length": 1519.979232788086,
"epoch": 0.4205714285714286,
"grad_norm": 3.1444971561431885,
"kl": 0.759765625,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0304,
"reward": 0.20653121452778578,
"reward_std": 0.18122152984142303,
"rewards/cosine_scaled_reward": -0.00040038255974650383,
"rewards/format_reward": 0.7916666865348816,
"step": 368
},
{
"completion_length": 1517.1875305175781,
"epoch": 0.4217142857142857,
"grad_norm": 1.515045166015625,
"kl": 0.861328125,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0344,
"reward": 0.2802426964044571,
"reward_std": 0.1838936284184456,
"rewards/cosine_scaled_reward": 0.1429410008713603,
"rewards/format_reward": 0.7916666865348816,
"step": 369
},
{
"completion_length": 1656.1458740234375,
"epoch": 0.4228571428571429,
"grad_norm": 3.1206541061401367,
"kl": 0.80908203125,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0323,
"reward": 0.0936688520014286,
"reward_std": 0.13261800445616245,
"rewards/cosine_scaled_reward": -0.2378309927880764,
"rewards/format_reward": 0.8333333432674408,
"step": 370
},
{
"completion_length": 1568.1875305175781,
"epoch": 0.424,
"grad_norm": 2.3558554649353027,
"kl": 0.93798828125,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0375,
"reward": 0.13434469606727362,
"reward_std": 0.16923817060887814,
"rewards/cosine_scaled_reward": -0.14115899708122015,
"rewards/format_reward": 0.7916666716337204,
"step": 371
},
{
"completion_length": 1899.3333435058594,
"epoch": 0.42514285714285716,
"grad_norm": 2.514725923538208,
"kl": 1.216796875,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0486,
"reward": 0.13189218938350677,
"reward_std": 0.11921382136642933,
"rewards/cosine_scaled_reward": -0.14287901669740677,
"rewards/format_reward": 0.7916667014360428,
"step": 372
},
{
"completion_length": 1694.166748046875,
"epoch": 0.42628571428571427,
"grad_norm": 2.2729876041412354,
"kl": 1.076171875,
"learning_rate": 2.655868138008171e-07,
"loss": 0.043,
"reward": 0.13227892480790615,
"reward_std": 0.18314684182405472,
"rewards/cosine_scaled_reward": -0.15385251492261887,
"rewards/format_reward": 0.8125000149011612,
"step": 373
},
{
"completion_length": 1518.2917175292969,
"epoch": 0.42742857142857144,
"grad_norm": 1.470110535621643,
"kl": 0.7646484375,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0306,
"reward": 0.24618086963891983,
"reward_std": 0.18493768386542797,
"rewards/cosine_scaled_reward": 0.02095278911292553,
"rewards/format_reward": 0.8958333432674408,
"step": 374
},
{
"completion_length": 1516.7084045410156,
"epoch": 0.42857142857142855,
"grad_norm": 1.2037338018417358,
"kl": 0.9248046875,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.037,
"reward": 0.19765574857592583,
"reward_std": 0.23449427634477615,
"rewards/cosine_scaled_reward": -0.02087587956339121,
"rewards/format_reward": 0.7916666716337204,
"step": 375
},
{
"completion_length": 1457.7083740234375,
"epoch": 0.4297142857142857,
"grad_norm": 2.412107229232788,
"kl": 0.76611328125,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0306,
"reward": 0.2110733650624752,
"reward_std": 0.24278932064771652,
"rewards/cosine_scaled_reward": -0.01106532383710146,
"rewards/format_reward": 0.833333358168602,
"step": 376
},
{
"completion_length": 1775.1458740234375,
"epoch": 0.4308571428571429,
"grad_norm": 1.3878467082977295,
"kl": 1.2080078125,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0483,
"reward": 0.1983587248250842,
"reward_std": 0.2635771445930004,
"rewards/cosine_scaled_reward": -0.025547289289534092,
"rewards/format_reward": 0.7916666865348816,
"step": 377
},
{
"completion_length": 1364.0625305175781,
"epoch": 0.432,
"grad_norm": 2.308479070663452,
"kl": 0.650390625,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.026,
"reward": 0.09404418990015984,
"reward_std": 0.1384107507765293,
"rewards/cosine_scaled_reward": -0.2571399100124836,
"rewards/format_reward": 0.8750000149011612,
"step": 378
},
{
"completion_length": 1627.7916870117188,
"epoch": 0.43314285714285716,
"grad_norm": 1.8004807233810425,
"kl": 0.79296875,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0317,
"reward": 0.24452932178974152,
"reward_std": 0.20264879241585732,
"rewards/cosine_scaled_reward": 0.02817649580538273,
"rewards/format_reward": 0.8750000149011612,
"step": 379
},
{
"completion_length": 1773.5208740234375,
"epoch": 0.4342857142857143,
"grad_norm": 2.1422038078308105,
"kl": 1.15234375,
"learning_rate": 2.488912271385139e-07,
"loss": 0.046,
"reward": 0.11856007762253284,
"reward_std": 0.14229279570281506,
"rewards/cosine_scaled_reward": -0.1719975546002388,
"rewards/format_reward": 0.7916666865348816,
"step": 380
},
{
"completion_length": 1725.7500915527344,
"epoch": 0.43542857142857144,
"grad_norm": 2.1980226039886475,
"kl": 0.9560546875,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0381,
"reward": 0.14323885599151254,
"reward_std": 0.21406007558107376,
"rewards/cosine_scaled_reward": -0.1250370437046513,
"rewards/format_reward": 0.7916666865348816,
"step": 381
},
{
"completion_length": 1673.3333740234375,
"epoch": 0.43657142857142855,
"grad_norm": 1.6990046501159668,
"kl": 1.20703125,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0484,
"reward": 0.11549163609743118,
"reward_std": 0.19104652479290962,
"rewards/cosine_scaled_reward": -0.16341273672878742,
"rewards/format_reward": 0.7708333432674408,
"step": 382
},
{
"completion_length": 2062.479248046875,
"epoch": 0.4377142857142857,
"grad_norm": 2.1265945434570312,
"kl": 1.5693359375,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0627,
"reward": 0.14214371237903833,
"reward_std": 0.21174047514796257,
"rewards/cosine_scaled_reward": -0.05033590830862522,
"rewards/format_reward": 0.6458333730697632,
"step": 383
},
{
"completion_length": 1379.2708740234375,
"epoch": 0.43885714285714283,
"grad_norm": 2.062028408050537,
"kl": 0.75439453125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0302,
"reward": 0.07300803670659661,
"reward_std": 0.12315612472593784,
"rewards/cosine_scaled_reward": -0.2674658801406622,
"rewards/format_reward": 0.8125000149011612,
"step": 384
},
{
"completion_length": 1687.6250762939453,
"epoch": 0.44,
"grad_norm": 2.788571834564209,
"kl": 0.99072265625,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0395,
"reward": 0.20402609836310148,
"reward_std": 0.2159460373222828,
"rewards/cosine_scaled_reward": -0.028315742500126362,
"rewards/format_reward": 0.8333333432674408,
"step": 385
},
{
"completion_length": 1615.9583740234375,
"epoch": 0.44114285714285717,
"grad_norm": 1.576268196105957,
"kl": 1.1650390625,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0467,
"reward": 0.17687237821519375,
"reward_std": 0.1281549371778965,
"rewards/cosine_scaled_reward": -0.10758153721690178,
"rewards/format_reward": 0.8958333432674408,
"step": 386
},
{
"completion_length": 1551.5625610351562,
"epoch": 0.4422857142857143,
"grad_norm": 1.610855221748352,
"kl": 0.573486328125,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0229,
"reward": 0.1070600375533104,
"reward_std": 0.16446635872125626,
"rewards/cosine_scaled_reward": -0.22158217430114746,
"rewards/format_reward": 0.8541666865348816,
"step": 387
},
{
"completion_length": 1435.6458435058594,
"epoch": 0.44342857142857145,
"grad_norm": 2.12188720703125,
"kl": 0.685302734375,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0274,
"reward": 0.20380490552634,
"reward_std": 0.11574576422572136,
"rewards/cosine_scaled_reward": -0.036669282941147685,
"rewards/format_reward": 0.8541666865348816,
"step": 388
},
{
"completion_length": 1297.0417175292969,
"epoch": 0.44457142857142856,
"grad_norm": 1.524010181427002,
"kl": 0.875732421875,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.035,
"reward": 0.2976585365831852,
"reward_std": 0.23040159419178963,
"rewards/cosine_scaled_reward": 0.12910030595958233,
"rewards/format_reward": 0.8750000149011612,
"step": 389
},
{
"completion_length": 1280.8541717529297,
"epoch": 0.44571428571428573,
"grad_norm": 0.8724046349525452,
"kl": 0.746826171875,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0298,
"reward": 0.1727432645857334,
"reward_std": 0.17489107139408588,
"rewards/cosine_scaled_reward": -0.11648351605981588,
"rewards/format_reward": 0.895833358168602,
"step": 390
},
{
"completion_length": 1218.6458740234375,
"epoch": 0.44685714285714284,
"grad_norm": 1.9020179510116577,
"kl": 0.556396484375,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0223,
"reward": 0.13444999419152737,
"reward_std": 0.1418241187930107,
"rewards/cosine_scaled_reward": -0.2053442131727934,
"rewards/format_reward": 0.9166666865348816,
"step": 391
},
{
"completion_length": 1699.0833740234375,
"epoch": 0.448,
"grad_norm": 3.0778512954711914,
"kl": 0.791015625,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0316,
"reward": 0.23890820518136024,
"reward_std": 0.3118077628314495,
"rewards/cosine_scaled_reward": 0.07110257190652192,
"rewards/format_reward": 0.7708333432674408,
"step": 392
},
{
"completion_length": 1719.2916870117188,
"epoch": 0.4491428571428571,
"grad_norm": 1.7322001457214355,
"kl": 1.0498046875,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.042,
"reward": 0.2058930192142725,
"reward_std": 0.2754768989980221,
"rewards/cosine_scaled_reward": 0.014654617756605148,
"rewards/format_reward": 0.7500000298023224,
"step": 393
},
{
"completion_length": 1581.8958740234375,
"epoch": 0.4502857142857143,
"grad_norm": 1.7970198392868042,
"kl": 0.75732421875,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0303,
"reward": 0.1278714146465063,
"reward_std": 0.16577338986098766,
"rewards/cosine_scaled_reward": -0.18303980166092515,
"rewards/format_reward": 0.8541666716337204,
"step": 394
},
{
"completion_length": 1615.8542175292969,
"epoch": 0.4514285714285714,
"grad_norm": 1.6947509050369263,
"kl": 0.90771484375,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0363,
"reward": 0.16298719588667154,
"reward_std": 0.1734189111739397,
"rewards/cosine_scaled_reward": -0.09799596574157476,
"rewards/format_reward": 0.8125000149011612,
"step": 395
},
{
"completion_length": 1059.1041870117188,
"epoch": 0.45257142857142857,
"grad_norm": 1.1382616758346558,
"kl": 0.41558837890625,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0166,
"reward": 0.20291254110634327,
"reward_std": 0.1326997596770525,
"rewards/cosine_scaled_reward": -0.04992395639419556,
"rewards/format_reward": 0.8750000149011612,
"step": 396
},
{
"completion_length": 1636.5625305175781,
"epoch": 0.45371428571428574,
"grad_norm": 1.6751741170883179,
"kl": 1.0986328125,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.044,
"reward": 0.34289546124637127,
"reward_std": 0.18598736822605133,
"rewards/cosine_scaled_reward": 0.26703778095543385,
"rewards/format_reward": 0.7500000149011612,
"step": 397
},
{
"completion_length": 1436.0833435058594,
"epoch": 0.45485714285714285,
"grad_norm": 1.3158280849456787,
"kl": 0.488494873046875,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0195,
"reward": 0.18827892653644085,
"reward_std": 0.18240927904844284,
"rewards/cosine_scaled_reward": -0.06533949635922909,
"rewards/format_reward": 0.8541666716337204,
"step": 398
},
{
"completion_length": 1633.2500610351562,
"epoch": 0.456,
"grad_norm": 1.3960447311401367,
"kl": 0.5896148681640625,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0236,
"reward": 0.15800490105175413,
"reward_std": 0.14584489725530148,
"rewards/cosine_scaled_reward": -0.07036726316437125,
"rewards/format_reward": 0.7500000149011612,
"step": 399
},
{
"completion_length": 1449.3333587646484,
"epoch": 0.45714285714285713,
"grad_norm": 2.858807325363159,
"kl": 0.79345703125,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0317,
"reward": 0.17510544694960117,
"reward_std": 0.20562008023262024,
"rewards/cosine_scaled_reward": -0.060673171654343605,
"rewards/format_reward": 0.7916666716337204,
"step": 400
},
{
"completion_length": 1245.7083587646484,
"epoch": 0.4582857142857143,
"grad_norm": 3.399758815765381,
"kl": 0.52587890625,
"learning_rate": 2.032690407508949e-07,
"loss": 0.021,
"reward": 0.2145949751138687,
"reward_std": 0.17422041855752468,
"rewards/cosine_scaled_reward": 0.00034935586154460907,
"rewards/format_reward": 0.8125000149011612,
"step": 401
},
{
"completion_length": 2146.5625610351562,
"epoch": 0.4594285714285714,
"grad_norm": 2.2153425216674805,
"kl": 1.408203125,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0564,
"reward": 0.06373792514204979,
"reward_std": 0.11548986099660397,
"rewards/cosine_scaled_reward": -0.2942846156656742,
"rewards/format_reward": 0.8333333730697632,
"step": 402
},
{
"completion_length": 1795.0833435058594,
"epoch": 0.4605714285714286,
"grad_norm": 2.41886043548584,
"kl": 0.943359375,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0378,
"reward": 0.12751809041947126,
"reward_std": 0.174501184374094,
"rewards/cosine_scaled_reward": -0.1311726775020361,
"rewards/format_reward": 0.7500000149011612,
"step": 403
},
{
"completion_length": 1662.2500610351562,
"epoch": 0.4617142857142857,
"grad_norm": 1.9953243732452393,
"kl": 0.9521484375,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0381,
"reward": 0.21517368033528328,
"reward_std": 0.18545049242675304,
"rewards/cosine_scaled_reward": 0.005411209538578987,
"rewards/format_reward": 0.8125000149011612,
"step": 404
},
{
"completion_length": 1612.9167175292969,
"epoch": 0.46285714285714286,
"grad_norm": 1.684091567993164,
"kl": 0.84716796875,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0339,
"reward": 0.2171931341290474,
"reward_std": 0.18458830192685127,
"rewards/cosine_scaled_reward": 0.0182991623878479,
"rewards/format_reward": 0.7916666865348816,
"step": 405
},
{
"completion_length": 1634.7709045410156,
"epoch": 0.464,
"grad_norm": 2.3493168354034424,
"kl": 1.029296875,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0412,
"reward": 0.19101523607969284,
"reward_std": 0.16082188487052917,
"rewards/cosine_scaled_reward": -0.06326993182301521,
"rewards/format_reward": 0.8541666865348816,
"step": 406
},
{
"completion_length": 1121.5416870117188,
"epoch": 0.46514285714285714,
"grad_norm": 2.3104114532470703,
"kl": 0.4576416015625,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0183,
"reward": 0.21706481464207172,
"reward_std": 0.14366690441966057,
"rewards/cosine_scaled_reward": -0.025802362710237503,
"rewards/format_reward": 0.8750000149011612,
"step": 407
},
{
"completion_length": 1074.9791870117188,
"epoch": 0.4662857142857143,
"grad_norm": 1.874539852142334,
"kl": 0.44500732421875,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0178,
"reward": 0.23207461088895798,
"reward_std": 0.20729456096887589,
"rewards/cosine_scaled_reward": -0.004256272688508034,
"rewards/format_reward": 0.8958333432674408,
"step": 408
},
{
"completion_length": 1555.0833740234375,
"epoch": 0.4674285714285714,
"grad_norm": 2.038965940475464,
"kl": 1.02734375,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0411,
"reward": 0.26453279703855515,
"reward_std": 0.19898507371544838,
"rewards/cosine_scaled_reward": 0.09699833486229181,
"rewards/format_reward": 0.8125000298023224,
"step": 409
},
{
"completion_length": 1686.0416870117188,
"epoch": 0.4685714285714286,
"grad_norm": 2.4619529247283936,
"kl": 1.145263671875,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0459,
"reward": 0.17928657121956348,
"reward_std": 0.18281647004187107,
"rewards/cosine_scaled_reward": -0.05253279022872448,
"rewards/format_reward": 0.7916666865348816,
"step": 410
},
{
"completion_length": 1760.2084350585938,
"epoch": 0.4697142857142857,
"grad_norm": 2.5391340255737305,
"kl": 1.0654296875,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0426,
"reward": 0.08570251986384392,
"reward_std": 0.2112545520067215,
"rewards/cosine_scaled_reward": -0.18753627687692642,
"rewards/format_reward": 0.708333358168602,
"step": 411
},
{
"completion_length": 1102.8542175292969,
"epoch": 0.47085714285714286,
"grad_norm": 0.7402982115745544,
"kl": 0.23406982421875,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0094,
"reward": 0.16945463605225086,
"reward_std": 0.13772385567426682,
"rewards/cosine_scaled_reward": -0.15359408780932426,
"rewards/format_reward": 0.9583333432674408,
"step": 412
},
{
"completion_length": 1526.5208740234375,
"epoch": 0.472,
"grad_norm": 2.8514583110809326,
"kl": 0.84326171875,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0338,
"reward": 0.07859875238500535,
"reward_std": 0.14428682066500187,
"rewards/cosine_scaled_reward": -0.23446397110819817,
"rewards/format_reward": 0.7708333432674408,
"step": 413
},
{
"completion_length": 1553.0208740234375,
"epoch": 0.47314285714285714,
"grad_norm": 1.3336910009384155,
"kl": 1.005615234375,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0403,
"reward": 0.24459942057728767,
"reward_std": 0.1802833005785942,
"rewards/cosine_scaled_reward": 0.06333907938096672,
"rewards/format_reward": 0.8125,
"step": 414
},
{
"completion_length": 1626.3334045410156,
"epoch": 0.4742857142857143,
"grad_norm": 2.653970718383789,
"kl": 0.87548828125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.035,
"reward": 0.25224715657532215,
"reward_std": 0.24098404496908188,
"rewards/cosine_scaled_reward": 0.05381806939840317,
"rewards/format_reward": 0.8541666865348816,
"step": 415
},
{
"completion_length": 1558.2083740234375,
"epoch": 0.4754285714285714,
"grad_norm": 2.744907855987549,
"kl": 0.89990234375,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0359,
"reward": 0.16436111507937312,
"reward_std": 0.18552083894610405,
"rewards/cosine_scaled_reward": -0.0836386177688837,
"rewards/format_reward": 0.7916666865348816,
"step": 416
},
{
"completion_length": 1913.791748046875,
"epoch": 0.4765714285714286,
"grad_norm": 2.4131460189819336,
"kl": 1.46630859375,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0587,
"reward": 0.10155561100691557,
"reward_std": 0.19174444302916527,
"rewards/cosine_scaled_reward": -0.15042966604232788,
"rewards/format_reward": 0.6875000149011612,
"step": 417
},
{
"completion_length": 1520.8333740234375,
"epoch": 0.4777142857142857,
"grad_norm": 3.986926317214966,
"kl": 1.1474609375,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0459,
"reward": 0.1306799417361617,
"reward_std": 0.1696070432662964,
"rewards/cosine_scaled_reward": -0.15072840079665184,
"rewards/format_reward": 0.7916666865348816,
"step": 418
},
{
"completion_length": 1411.2500610351562,
"epoch": 0.47885714285714287,
"grad_norm": 1.6700776815414429,
"kl": 0.77392578125,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0309,
"reward": 0.11982119083404541,
"reward_std": 0.14616922289133072,
"rewards/cosine_scaled_reward": -0.2155298045836389,
"rewards/format_reward": 0.8958333432674408,
"step": 419
},
{
"completion_length": 1086.3750457763672,
"epoch": 0.48,
"grad_norm": 2.8618907928466797,
"kl": 0.39306640625,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0157,
"reward": 0.2151604052633047,
"reward_std": 0.20470884442329407,
"rewards/cosine_scaled_reward": -0.01572578027844429,
"rewards/format_reward": 0.8541666865348816,
"step": 420
},
{
"completion_length": 1218.75,
"epoch": 0.48114285714285715,
"grad_norm": 1.8824008703231812,
"kl": 0.43896484375,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0175,
"reward": 0.27937930822372437,
"reward_std": 0.15520622581243515,
"rewards/cosine_scaled_reward": 0.03862675465643406,
"rewards/format_reward": 0.9791666716337204,
"step": 421
},
{
"completion_length": 1495.3333740234375,
"epoch": 0.48228571428571426,
"grad_norm": 1.9558619260787964,
"kl": 0.87109375,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0348,
"reward": 0.18802766874432564,
"reward_std": 0.15786195173859596,
"rewards/cosine_scaled_reward": -0.1094935517758131,
"rewards/format_reward": 0.9375000149011612,
"step": 422
},
{
"completion_length": 1453.8125457763672,
"epoch": 0.48342857142857143,
"grad_norm": 3.1225712299346924,
"kl": 0.58935546875,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0235,
"reward": 0.1692509911954403,
"reward_std": 0.1672309935092926,
"rewards/cosine_scaled_reward": -0.09207071270793676,
"rewards/format_reward": 0.8333333432674408,
"step": 423
},
{
"completion_length": 1574.9583740234375,
"epoch": 0.4845714285714286,
"grad_norm": 1.3268849849700928,
"kl": 0.6416015625,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0256,
"reward": 0.20429514534771442,
"reward_std": 0.14665967971086502,
"rewards/cosine_scaled_reward": -0.08527943585067987,
"rewards/format_reward": 0.9583333432674408,
"step": 424
},
{
"completion_length": 1544.3542175292969,
"epoch": 0.4857142857142857,
"grad_norm": 2.838041067123413,
"kl": 0.59716796875,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0239,
"reward": 0.15881402208469808,
"reward_std": 0.23204398341476917,
"rewards/cosine_scaled_reward": -0.10290774330496788,
"rewards/format_reward": 0.8125000149011612,
"step": 425
},
{
"completion_length": 1421.0208740234375,
"epoch": 0.4868571428571429,
"grad_norm": 2.232002019882202,
"kl": 0.671875,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0269,
"reward": 0.16967828944325447,
"reward_std": 0.22798865288496017,
"rewards/cosine_scaled_reward": -0.06918483227491379,
"rewards/format_reward": 0.7916666865348816,
"step": 426
},
{
"completion_length": 1699.3125610351562,
"epoch": 0.488,
"grad_norm": 1.7416229248046875,
"kl": 1.1943359375,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0478,
"reward": 0.16669721342623234,
"reward_std": 0.23273145407438278,
"rewards/cosine_scaled_reward": -0.0462256595492363,
"rewards/format_reward": 0.7291666865348816,
"step": 427
},
{
"completion_length": 1743.4375610351562,
"epoch": 0.48914285714285716,
"grad_norm": 1.2741050720214844,
"kl": 0.7529296875,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0301,
"reward": 0.1672147586941719,
"reward_std": 0.12992947921156883,
"rewards/cosine_scaled_reward": -0.1202060398645699,
"rewards/format_reward": 0.8750000149011612,
"step": 428
},
{
"completion_length": 1644.7916870117188,
"epoch": 0.49028571428571427,
"grad_norm": 2.6569628715515137,
"kl": 0.9384765625,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0375,
"reward": 0.19867986720055342,
"reward_std": 0.2007097192108631,
"rewards/cosine_scaled_reward": -0.02384123019874096,
"rewards/format_reward": 0.8125000149011612,
"step": 429
},
{
"completion_length": 1961.1666870117188,
"epoch": 0.49142857142857144,
"grad_norm": 1.7140686511993408,
"kl": 1.21484375,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0486,
"reward": 0.10232608858495951,
"reward_std": 0.16942250356078148,
"rewards/cosine_scaled_reward": -0.09778407961130142,
"rewards/format_reward": 0.5833333656191826,
"step": 430
},
{
"completion_length": 1212.3333740234375,
"epoch": 0.49257142857142855,
"grad_norm": 2.095090866088867,
"kl": 0.676513671875,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.027,
"reward": 0.25252123549580574,
"reward_std": 0.17253945022821426,
"rewards/cosine_scaled_reward": 0.02987060695886612,
"rewards/format_reward": 0.895833358168602,
"step": 431
},
{
"completion_length": 1627.5833740234375,
"epoch": 0.4937142857142857,
"grad_norm": 1.731967806816101,
"kl": 1.109375,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0444,
"reward": 0.1790441758930683,
"reward_std": 0.21966011822223663,
"rewards/cosine_scaled_reward": -0.07679930981248617,
"rewards/format_reward": 0.8333333432674408,
"step": 432
},
{
"completion_length": 1566.9583587646484,
"epoch": 0.4948571428571429,
"grad_norm": 2.4567792415618896,
"kl": 0.8028564453125,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0321,
"reward": 0.16420520408428274,
"reward_std": 0.19683999940752983,
"rewards/cosine_scaled_reward": -0.123224092181772,
"rewards/format_reward": 0.8750000149011612,
"step": 433
},
{
"completion_length": 982.0000610351562,
"epoch": 0.496,
"grad_norm": 19.401472091674805,
"kl": 0.561065673828125,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0225,
"reward": 0.32596323639154434,
"reward_std": 0.19744369760155678,
"rewards/cosine_scaled_reward": 0.14631427451968193,
"rewards/format_reward": 0.9375000149011612,
"step": 434
},
{
"completion_length": 1534.7917175292969,
"epoch": 0.49714285714285716,
"grad_norm": 6.2816362380981445,
"kl": 1.1749267578125,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0471,
"reward": 0.13189730420708656,
"reward_std": 0.14833365753293037,
"rewards/cosine_scaled_reward": -0.17807801440358162,
"rewards/format_reward": 0.8541666716337204,
"step": 435
},
{
"completion_length": 1608.6875305175781,
"epoch": 0.4982857142857143,
"grad_norm": 1.2652208805084229,
"kl": 0.8369140625,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0334,
"reward": 0.1510629504919052,
"reward_std": 0.20369192957878113,
"rewards/cosine_scaled_reward": -0.11619596276432276,
"rewards/format_reward": 0.8125000149011612,
"step": 436
},
{
"completion_length": 1255.1458740234375,
"epoch": 0.49942857142857144,
"grad_norm": 1.7958463430404663,
"kl": 0.46435546875,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0186,
"reward": 0.23130175843834877,
"reward_std": 0.1911556702107191,
"rewards/cosine_scaled_reward": -0.006543227471411228,
"rewards/format_reward": 0.895833358168602,
"step": 437
},
{
"completion_length": 1066.3333740234375,
"epoch": 0.5005714285714286,
"grad_norm": 2.252562999725342,
"kl": 0.41064453125,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0164,
"reward": 0.21582691743969917,
"reward_std": 0.18366244062781334,
"rewards/cosine_scaled_reward": -0.07878882065415382,
"rewards/format_reward": 0.9791666716337204,
"step": 438
},
{
"completion_length": 1176.6667022705078,
"epoch": 0.5017142857142857,
"grad_norm": 1.840851068496704,
"kl": 0.83984375,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0336,
"reward": 0.1915587205439806,
"reward_std": 0.15870841406285763,
"rewards/cosine_scaled_reward": -0.08411563746631145,
"rewards/format_reward": 0.895833358168602,
"step": 439
},
{
"completion_length": 1799.9583740234375,
"epoch": 0.5028571428571429,
"grad_norm": 1.910586953163147,
"kl": 1.419921875,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0569,
"reward": 0.15009203157387674,
"reward_std": 0.16570111364126205,
"rewards/cosine_scaled_reward": -0.08884701132774353,
"rewards/format_reward": 0.7500000298023224,
"step": 440
},
{
"completion_length": 1417.562515258789,
"epoch": 0.504,
"grad_norm": 2.846151828765869,
"kl": 0.65283203125,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0261,
"reward": 0.25939593836665154,
"reward_std": 0.1876070685684681,
"rewards/cosine_scaled_reward": 0.09080945514142513,
"rewards/format_reward": 0.8125000149011612,
"step": 441
},
{
"completion_length": 1395.0417175292969,
"epoch": 0.5051428571428571,
"grad_norm": 1.3554480075836182,
"kl": 0.4969482421875,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0199,
"reward": 0.1746810134500265,
"reward_std": 0.18113730661571026,
"rewards/cosine_scaled_reward": -0.11311907507479191,
"rewards/format_reward": 0.8958333432674408,
"step": 442
},
{
"completion_length": 1326.6250305175781,
"epoch": 0.5062857142857143,
"grad_norm": 2.2575602531433105,
"kl": 0.5147705078125,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0206,
"reward": 0.17062465287745,
"reward_std": 0.17788580060005188,
"rewards/cosine_scaled_reward": -0.12135545909404755,
"rewards/format_reward": 0.895833358168602,
"step": 443
},
{
"completion_length": 1452.4375305175781,
"epoch": 0.5074285714285715,
"grad_norm": 2.0496582984924316,
"kl": 0.669921875,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0267,
"reward": 0.21649761497974396,
"reward_std": 0.1603663358837366,
"rewards/cosine_scaled_reward": -0.0013428553938865662,
"rewards/format_reward": 0.8333333432674408,
"step": 444
},
{
"completion_length": 1655.3750610351562,
"epoch": 0.5085714285714286,
"grad_norm": 1.7483857870101929,
"kl": 1.0380859375,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0415,
"reward": 0.14976151008158922,
"reward_std": 0.12271312810480595,
"rewards/cosine_scaled_reward": -0.10680487379431725,
"rewards/format_reward": 0.7916666716337204,
"step": 445
},
{
"completion_length": 1138.9375457763672,
"epoch": 0.5097142857142857,
"grad_norm": 1.3604767322540283,
"kl": 0.5885009765625,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0235,
"reward": 0.20577935874462128,
"reward_std": 0.19339029118418694,
"rewards/cosine_scaled_reward": -0.0911331009119749,
"rewards/format_reward": 0.9583333432674408,
"step": 446
},
{
"completion_length": 1263.1250610351562,
"epoch": 0.5108571428571429,
"grad_norm": 1.865720510482788,
"kl": 1.154052734375,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0462,
"reward": 0.14829288161126897,
"reward_std": 0.17679531127214432,
"rewards/cosine_scaled_reward": -0.14904707111418247,
"rewards/format_reward": 0.8541666865348816,
"step": 447
},
{
"completion_length": 1118.375015258789,
"epoch": 0.512,
"grad_norm": 1.1738076210021973,
"kl": 0.08587646484375,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0034,
"reward": 0.29666774719953537,
"reward_std": 0.2335982620716095,
"rewards/cosine_scaled_reward": 0.07708277204073966,
"rewards/format_reward": 0.9791666716337204,
"step": 448
},
{
"completion_length": 1263.687515258789,
"epoch": 0.5131428571428571,
"grad_norm": 2.511007308959961,
"kl": 0.743408203125,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0297,
"reward": 0.17201264947652817,
"reward_std": 0.2096976675093174,
"rewards/cosine_scaled_reward": -0.11413275334052742,
"rewards/format_reward": 0.8750000298023224,
"step": 449
},
{
"completion_length": 1255.812515258789,
"epoch": 0.5142857142857142,
"grad_norm": 3.0915615558624268,
"kl": 1.0966796875,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.044,
"reward": 0.19247347861528397,
"reward_std": 0.2065977193415165,
"rewards/cosine_scaled_reward": -0.023884066613391042,
"rewards/format_reward": 0.770833358168602,
"step": 450
},
{
"completion_length": 1148.1667175292969,
"epoch": 0.5154285714285715,
"grad_norm": 2.033189296722412,
"kl": 0.53155517578125,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0212,
"reward": 0.25062017887830734,
"reward_std": 0.1323441956192255,
"rewards/cosine_scaled_reward": 0.031208358705043793,
"rewards/format_reward": 0.8958333432674408,
"step": 451
},
{
"completion_length": 1149.8542022705078,
"epoch": 0.5165714285714286,
"grad_norm": 2.3381247520446777,
"kl": 0.7998046875,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.032,
"reward": 0.09467406664043665,
"reward_std": 0.1573140937834978,
"rewards/cosine_scaled_reward": -0.23562941327691078,
"rewards/format_reward": 0.833333358168602,
"step": 452
},
{
"completion_length": 1543.8542175292969,
"epoch": 0.5177142857142857,
"grad_norm": 3.0497689247131348,
"kl": 1.39990234375,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0559,
"reward": 0.23793689720332623,
"reward_std": 0.24361500516533852,
"rewards/cosine_scaled_reward": 0.05131397116929293,
"rewards/format_reward": 0.7916666865348816,
"step": 453
},
{
"completion_length": 995.3333740234375,
"epoch": 0.5188571428571429,
"grad_norm": 2.2683708667755127,
"kl": 0.62646484375,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0251,
"reward": 0.16244725324213505,
"reward_std": 0.1817509774118662,
"rewards/cosine_scaled_reward": -0.1395284836180508,
"rewards/format_reward": 0.895833358168602,
"step": 454
},
{
"completion_length": 1405.8333435058594,
"epoch": 0.52,
"grad_norm": 1.8915313482284546,
"kl": 1.2333984375,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0494,
"reward": 0.15508674364537,
"reward_std": 0.15496904775500298,
"rewards/cosine_scaled_reward": -0.12304865941405296,
"rewards/format_reward": 0.8333333432674408,
"step": 455
},
{
"completion_length": 1620.3750305175781,
"epoch": 0.5211428571428571,
"grad_norm": 2.3792831897735596,
"kl": 0.9970703125,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0399,
"reward": 0.19286850281059742,
"reward_std": 0.1363586913794279,
"rewards/cosine_scaled_reward": -0.02547831228002906,
"rewards/format_reward": 0.7916666865348816,
"step": 456
},
{
"completion_length": 1495.729248046875,
"epoch": 0.5222857142857142,
"grad_norm": 2.826035737991333,
"kl": 1.41796875,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0568,
"reward": 0.18930382770486176,
"reward_std": 0.19817211106419563,
"rewards/cosine_scaled_reward": -0.011334592942148447,
"rewards/format_reward": 0.7500000149011612,
"step": 457
},
{
"completion_length": 1296.2708740234375,
"epoch": 0.5234285714285715,
"grad_norm": 2.695727586746216,
"kl": 0.623291015625,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.025,
"reward": 0.21301186457276344,
"reward_std": 0.2375541441142559,
"rewards/cosine_scaled_reward": -0.030085243575740606,
"rewards/format_reward": 0.8750000149011612,
"step": 458
},
{
"completion_length": 1180.8958740234375,
"epoch": 0.5245714285714286,
"grad_norm": 1.4999505281448364,
"kl": 0.8074951171875,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0323,
"reward": 0.19034619256854057,
"reward_std": 0.23577242344617844,
"rewards/cosine_scaled_reward": -0.04188129701651633,
"rewards/format_reward": 0.8125000149011612,
"step": 459
},
{
"completion_length": 1674.2500610351562,
"epoch": 0.5257142857142857,
"grad_norm": 3.6569175720214844,
"kl": 1.400390625,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0561,
"reward": 0.20811645686626434,
"reward_std": 0.24132521450519562,
"rewards/cosine_scaled_reward": -0.020151358097791672,
"rewards/format_reward": 0.833333358168602,
"step": 460
},
{
"completion_length": 1561.0625305175781,
"epoch": 0.5268571428571428,
"grad_norm": 2.9864399433135986,
"kl": 0.868896484375,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0348,
"reward": 0.23855134937912226,
"reward_std": 0.190349493175745,
"rewards/cosine_scaled_reward": 0.05257879290729761,
"rewards/format_reward": 0.8125000149011612,
"step": 461
},
{
"completion_length": 1403.0417022705078,
"epoch": 0.528,
"grad_norm": 2.8803505897521973,
"kl": 0.5537109375,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0222,
"reward": 0.1879023276269436,
"reward_std": 0.17408592253923416,
"rewards/cosine_scaled_reward": -0.11774074472486973,
"rewards/format_reward": 0.9583333432674408,
"step": 462
},
{
"completion_length": 1038.3958740234375,
"epoch": 0.5291428571428571,
"grad_norm": 2.510594367980957,
"kl": 0.8834228515625,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0353,
"reward": 0.2329028071835637,
"reward_std": 0.1337026245892048,
"rewards/cosine_scaled_reward": 0.04953182302415371,
"rewards/format_reward": 0.7916666716337204,
"step": 463
},
{
"completion_length": 1624.3959045410156,
"epoch": 0.5302857142857142,
"grad_norm": 2.223735809326172,
"kl": 0.958251953125,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0383,
"reward": 0.24998359940946102,
"reward_std": 0.1796153038740158,
"rewards/cosine_scaled_reward": 0.052480582147836685,
"rewards/format_reward": 0.8541666865348816,
"step": 464
},
{
"completion_length": 1534.479248046875,
"epoch": 0.5314285714285715,
"grad_norm": 1.6499871015548706,
"kl": 1.323486328125,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0531,
"reward": 0.1806503850966692,
"reward_std": 0.18742095679044724,
"rewards/cosine_scaled_reward": -0.052027489989995956,
"rewards/format_reward": 0.7916666865348816,
"step": 465
},
{
"completion_length": 1744.5209045410156,
"epoch": 0.5325714285714286,
"grad_norm": 1.7625157833099365,
"kl": 0.74755859375,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0299,
"reward": 0.12266920693218708,
"reward_std": 0.14947674423456192,
"rewards/cosine_scaled_reward": -0.1689941380172968,
"rewards/format_reward": 0.8125000298023224,
"step": 466
},
{
"completion_length": 1490.854232788086,
"epoch": 0.5337142857142857,
"grad_norm": 2.6080636978149414,
"kl": 1.408203125,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0562,
"reward": 0.1923494804650545,
"reward_std": 0.18007073551416397,
"rewards/cosine_scaled_reward": -0.0493585430085659,
"rewards/format_reward": 0.833333358168602,
"step": 467
},
{
"completion_length": 1635.5001068115234,
"epoch": 0.5348571428571428,
"grad_norm": 5.10135555267334,
"kl": 1.4853515625,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0594,
"reward": 0.2753597334958613,
"reward_std": 0.18230173736810684,
"rewards/cosine_scaled_reward": 0.1316775605082512,
"rewards/format_reward": 0.7916666716337204,
"step": 468
},
{
"completion_length": 1196.0208740234375,
"epoch": 0.536,
"grad_norm": 3.6090939044952393,
"kl": 0.96044921875,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0385,
"reward": 0.15217299573123455,
"reward_std": 0.16991863399744034,
"rewards/cosine_scaled_reward": -0.15571825858205557,
"rewards/format_reward": 0.895833358168602,
"step": 469
},
{
"completion_length": 1328.2917022705078,
"epoch": 0.5371428571428571,
"grad_norm": 2.7696914672851562,
"kl": 1.2490234375,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.05,
"reward": 0.2623746544122696,
"reward_std": 0.1682613156735897,
"rewards/cosine_scaled_reward": 0.06257599592208862,
"rewards/format_reward": 0.8750000149011612,
"step": 470
},
{
"completion_length": 1151.2500305175781,
"epoch": 0.5382857142857143,
"grad_norm": 2.2605948448181152,
"kl": 1.0380859375,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0415,
"reward": 0.1418502125889063,
"reward_std": 0.1402505859732628,
"rewards/cosine_scaled_reward": -0.16590357944369316,
"rewards/format_reward": 0.8750000149011612,
"step": 471
},
{
"completion_length": 881.2916717529297,
"epoch": 0.5394285714285715,
"grad_norm": 4.6680588722229,
"kl": 0.572265625,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0229,
"reward": 0.2540171667933464,
"reward_std": 0.21371759288012981,
"rewards/cosine_scaled_reward": 0.0268446896225214,
"rewards/format_reward": 0.9166666865348816,
"step": 472
},
{
"completion_length": 1405.8958740234375,
"epoch": 0.5405714285714286,
"grad_norm": 3.778167963027954,
"kl": 1.0146484375,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0405,
"reward": 0.08911328506655991,
"reward_std": 0.14409188739955425,
"rewards/cosine_scaled_reward": -0.25133184157311916,
"rewards/format_reward": 0.833333358168602,
"step": 473
},
{
"completion_length": 1369.7083892822266,
"epoch": 0.5417142857142857,
"grad_norm": 2.8626537322998047,
"kl": 0.7666015625,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0307,
"reward": 0.16908735921606421,
"reward_std": 0.17285825684666634,
"rewards/cosine_scaled_reward": -0.10159570351243019,
"rewards/format_reward": 0.8541666865348816,
"step": 474
},
{
"completion_length": 1265.7083740234375,
"epoch": 0.5428571428571428,
"grad_norm": 1.8364455699920654,
"kl": 0.827392578125,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0331,
"reward": 0.19515487863100134,
"reward_std": 0.15950417518615723,
"rewards/cosine_scaled_reward": -0.08009354583919048,
"rewards/format_reward": 0.8958333432674408,
"step": 475
},
{
"completion_length": 1177.2917022705078,
"epoch": 0.544,
"grad_norm": 3.9011993408203125,
"kl": 0.51513671875,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0206,
"reward": 0.2876299601048231,
"reward_std": 0.1502437572926283,
"rewards/cosine_scaled_reward": 0.12480364367365837,
"rewards/format_reward": 0.8541666865348816,
"step": 476
},
{
"completion_length": 1044.687515258789,
"epoch": 0.5451428571428572,
"grad_norm": 1.5102007389068604,
"kl": 0.4571533203125,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0183,
"reward": 0.2441038154065609,
"reward_std": 0.17297399789094925,
"rewards/cosine_scaled_reward": 0.020557187497615814,
"rewards/format_reward": 0.8958333432674408,
"step": 477
},
{
"completion_length": 1297.6875457763672,
"epoch": 0.5462857142857143,
"grad_norm": 2.5546422004699707,
"kl": 0.71649169921875,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0286,
"reward": 0.17578930966556072,
"reward_std": 0.11584887467324734,
"rewards/cosine_scaled_reward": -0.1447231061756611,
"rewards/format_reward": 0.9583333432674408,
"step": 478
},
{
"completion_length": 1317.3542022705078,
"epoch": 0.5474285714285714,
"grad_norm": 2.733382225036621,
"kl": 0.9921875,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0398,
"reward": 0.2549874298274517,
"reward_std": 0.23050136864185333,
"rewards/cosine_scaled_reward": 0.07073704898357391,
"rewards/format_reward": 0.8333333432674408,
"step": 479
},
{
"completion_length": 1605.8333740234375,
"epoch": 0.5485714285714286,
"grad_norm": 2.0302486419677734,
"kl": 0.89404296875,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0358,
"reward": 0.19943943247199059,
"reward_std": 0.2432672716677189,
"rewards/cosine_scaled_reward": -0.05472554266452789,
"rewards/format_reward": 0.8750000149011612,
"step": 480
},
{
"completion_length": 1229.3750305175781,
"epoch": 0.5497142857142857,
"grad_norm": 3.4060001373291016,
"kl": 0.708984375,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0283,
"reward": 0.13737879018299282,
"reward_std": 0.14800015836954117,
"rewards/cosine_scaled_reward": -0.18615208379924297,
"rewards/format_reward": 0.8958333432674408,
"step": 481
},
{
"completion_length": 1038.5625305175781,
"epoch": 0.5508571428571428,
"grad_norm": 1.422458291053772,
"kl": 0.296142578125,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0118,
"reward": 0.2588203465566039,
"reward_std": 0.14372991025447845,
"rewards/cosine_scaled_reward": 0.014510583132505417,
"rewards/format_reward": 0.9583333432674408,
"step": 482
},
{
"completion_length": 1331.8542175292969,
"epoch": 0.552,
"grad_norm": 4.249495029449463,
"kl": 1.3828125,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0552,
"reward": 0.25617819651961327,
"reward_std": 0.18873486295342445,
"rewards/cosine_scaled_reward": 0.04960942268371582,
"rewards/format_reward": 0.8750000149011612,
"step": 483
},
{
"completion_length": 1384.0833740234375,
"epoch": 0.5531428571428572,
"grad_norm": 2.3114612102508545,
"kl": 0.849609375,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.034,
"reward": 0.28373695723712444,
"reward_std": 0.23551687598228455,
"rewards/cosine_scaled_reward": 0.09562918171286583,
"rewards/format_reward": 0.895833358168602,
"step": 484
},
{
"completion_length": 1479.2292022705078,
"epoch": 0.5542857142857143,
"grad_norm": 2.420201063156128,
"kl": 1.4007568359375,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.056,
"reward": 0.160817326977849,
"reward_std": 0.18855691701173782,
"rewards/cosine_scaled_reward": -0.10896717384457588,
"rewards/format_reward": 0.8125000149011612,
"step": 485
},
{
"completion_length": 1231.2292022705078,
"epoch": 0.5554285714285714,
"grad_norm": 2.2929489612579346,
"kl": 0.67431640625,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.027,
"reward": 0.27992733381688595,
"reward_std": 0.1733872890472412,
"rewards/cosine_scaled_reward": 0.0710078589618206,
"rewards/format_reward": 0.9375000149011612,
"step": 486
},
{
"completion_length": 1449.291748046875,
"epoch": 0.5565714285714286,
"grad_norm": 4.631397724151611,
"kl": 0.955078125,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0381,
"reward": 0.19613806903362274,
"reward_std": 0.18428167328238487,
"rewards/cosine_scaled_reward": -0.10239327140152454,
"rewards/format_reward": 0.9583333432674408,
"step": 487
},
{
"completion_length": 1643.8750305175781,
"epoch": 0.5577142857142857,
"grad_norm": 2.999095916748047,
"kl": 1.26953125,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0508,
"reward": 0.14464829117059708,
"reward_std": 0.164125744253397,
"rewards/cosine_scaled_reward": -0.15228741243481636,
"rewards/format_reward": 0.8541666865348816,
"step": 488
},
{
"completion_length": 855.1250228881836,
"epoch": 0.5588571428571428,
"grad_norm": 2.9949464797973633,
"kl": 0.30072021484375,
"learning_rate": 1.013262614978859e-07,
"loss": 0.012,
"reward": 0.2884952202439308,
"reward_std": 0.1300883013755083,
"rewards/cosine_scaled_reward": 0.063195139169693,
"rewards/format_reward": 0.9791666716337204,
"step": 489
},
{
"completion_length": 1136.1250305175781,
"epoch": 0.56,
"grad_norm": 1.8440228700637817,
"kl": 0.775390625,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.031,
"reward": 0.17430318985134363,
"reward_std": 0.1413884162902832,
"rewards/cosine_scaled_reward": -0.11317074298858643,
"rewards/format_reward": 0.8958333432674408,
"step": 490
},
{
"completion_length": 1581.7708587646484,
"epoch": 0.5611428571428572,
"grad_norm": 2.8860220909118652,
"kl": 0.88128662109375,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0353,
"reward": 0.18477380648255348,
"reward_std": 0.16339224576950073,
"rewards/cosine_scaled_reward": -0.032030028640292585,
"rewards/format_reward": 0.770833358168602,
"step": 491
},
{
"completion_length": 1146.4166717529297,
"epoch": 0.5622857142857143,
"grad_norm": 2.8887076377868652,
"kl": 1.0263671875,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0411,
"reward": 0.20537487417459488,
"reward_std": 0.17985284700989723,
"rewards/cosine_scaled_reward": -0.05676530674099922,
"rewards/format_reward": 0.8958333432674408,
"step": 492
},
{
"completion_length": 1219.9167175292969,
"epoch": 0.5634285714285714,
"grad_norm": 11.495869636535645,
"kl": 0.35302734375,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0141,
"reward": 0.17675728350877762,
"reward_std": 0.16118020564317703,
"rewards/cosine_scaled_reward": -0.13073305413126945,
"rewards/format_reward": 0.9375000149011612,
"step": 493
},
{
"completion_length": 1401.2500305175781,
"epoch": 0.5645714285714286,
"grad_norm": 3.791532278060913,
"kl": 1.544677734375,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0618,
"reward": 0.12077869102358818,
"reward_std": 0.1393336970359087,
"rewards/cosine_scaled_reward": -0.20588408038020134,
"rewards/format_reward": 0.8750000149011612,
"step": 494
},
{
"completion_length": 1228.2708740234375,
"epoch": 0.5657142857142857,
"grad_norm": 2.0392343997955322,
"kl": 0.72314453125,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0289,
"reward": 0.2310006357729435,
"reward_std": 0.14271893352270126,
"rewards/cosine_scaled_reward": -0.00487779825925827,
"rewards/format_reward": 0.895833358168602,
"step": 495
},
{
"completion_length": 1170.8541870117188,
"epoch": 0.5668571428571428,
"grad_norm": 2.2888383865356445,
"kl": 0.8193359375,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0328,
"reward": 0.33169906958937645,
"reward_std": 0.14619917422533035,
"rewards/cosine_scaled_reward": 0.18171185720711946,
"rewards/format_reward": 0.8958333432674408,
"step": 496
},
{
"completion_length": 1481.7292175292969,
"epoch": 0.568,
"grad_norm": 2.517068386077881,
"kl": 1.252685546875,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0501,
"reward": 0.2681533806025982,
"reward_std": 0.20135387405753136,
"rewards/cosine_scaled_reward": 0.10805653966963291,
"rewards/format_reward": 0.8125000149011612,
"step": 497
},
{
"completion_length": 1335.375,
"epoch": 0.5691428571428572,
"grad_norm": 2.2976410388946533,
"kl": 0.9261474609375,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0372,
"reward": 0.156329445540905,
"reward_std": 0.1802426353096962,
"rewards/cosine_scaled_reward": -0.16677019745111465,
"rewards/format_reward": 0.9375000149011612,
"step": 498
},
{
"completion_length": 1487.9583740234375,
"epoch": 0.5702857142857143,
"grad_norm": 1.1875653266906738,
"kl": 0.619964599609375,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0248,
"reward": 0.12715653842315078,
"reward_std": 0.1571529433131218,
"rewards/cosine_scaled_reward": -0.2139046173542738,
"rewards/format_reward": 0.9166666865348816,
"step": 499
},
{
"completion_length": 1511.5833740234375,
"epoch": 0.5714285714285714,
"grad_norm": 3.7676918506622314,
"kl": 1.2734375,
"learning_rate": 1e-07,
"loss": 0.051,
"reward": 0.1403972152620554,
"reward_std": 0.23328615352511406,
"rewards/cosine_scaled_reward": -0.10787822678685188,
"rewards/format_reward": 0.7500000149011612,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.014876242799652573,
"train_runtime": 36292.0916,
"train_samples_per_second": 0.661,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}