OpenRS-RLoRA-LoftQ-R16 / trainer_state.json
ajarts88's picture
Model save
d84f8bc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2857142857142857,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 3094.7083435058594,
"epoch": 0.0005714285714285715,
"grad_norm": 0.03110666386783123,
"kl": 0.001583099365234375,
"learning_rate": 2e-08,
"loss": -0.0184,
"reward": 0.137441948056221,
"reward_std": 0.21623625233769417,
"rewards/cosine_scaled_reward": -0.056279025971889496,
"rewards/format_reward": 0.25,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2744.875045776367,
"epoch": 0.001142857142857143,
"grad_norm": 0.032704442739486694,
"kl": 0.0012607574462890625,
"learning_rate": 4e-08,
"loss": 0.0647,
"reward": -0.10802149772644043,
"reward_std": 0.2796471230685711,
"rewards/cosine_scaled_reward": -0.2831774242222309,
"rewards/format_reward": 0.4583333432674408,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2880.9166870117188,
"epoch": 0.0017142857142857142,
"grad_norm": 0.04212239384651184,
"kl": 0.0011577606201171875,
"learning_rate": 6e-08,
"loss": 0.078,
"reward": 0.5520303323864937,
"reward_std": 0.7616261765360832,
"rewards/cosine_scaled_reward": 0.08851515129208565,
"rewards/format_reward": 0.3750000149011612,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2819.7916870117188,
"epoch": 0.002285714285714286,
"grad_norm": 0.0290143433958292,
"kl": 0.001392364501953125,
"learning_rate": 8e-08,
"loss": 0.0459,
"reward": 0.10864349454641342,
"reward_std": 0.5680626817047596,
"rewards/cosine_scaled_reward": -0.1956782626803033,
"rewards/format_reward": 0.5000000111758709,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2818.1666717529297,
"epoch": 0.002857142857142857,
"grad_norm": 0.055745821446180344,
"kl": 0.001789093017578125,
"learning_rate": 1e-07,
"loss": -0.0589,
"reward": -0.3098345100879669,
"reward_std": 0.3320516124367714,
"rewards/cosine_scaled_reward": -0.30075057595968246,
"rewards/format_reward": 0.2916666679084301,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 2811.625,
"epoch": 0.0034285714285714284,
"grad_norm": 0.044995225965976715,
"kl": 0.0012950897216796875,
"learning_rate": 1.2e-07,
"loss": 0.0259,
"reward": -0.19174443185329437,
"reward_std": 0.1734611876308918,
"rewards/cosine_scaled_reward": -0.22087222337722778,
"rewards/format_reward": 0.25,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 2999.750030517578,
"epoch": 0.004,
"grad_norm": 0.03707379847764969,
"kl": 0.0017557144165039062,
"learning_rate": 1.4e-07,
"loss": 0.254,
"reward": 0.20309243840165436,
"reward_std": 0.9162530265748501,
"rewards/cosine_scaled_reward": -0.06512045487761497,
"rewards/format_reward": 0.3333333432674408,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 3069.3333435058594,
"epoch": 0.004571428571428572,
"grad_norm": 0.028608666732907295,
"kl": 0.0014400482177734375,
"learning_rate": 1.6e-07,
"loss": -0.0445,
"reward": -0.5495130568742752,
"reward_std": 0.15696508716791868,
"rewards/cosine_scaled_reward": -0.3997565358877182,
"rewards/format_reward": 0.25,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2464.5000610351562,
"epoch": 0.005142857142857143,
"grad_norm": 0.033542804419994354,
"kl": 0.0013322830200195312,
"learning_rate": 1.8e-07,
"loss": 0.0334,
"reward": 0.45928669965360314,
"reward_std": 0.5223925784230232,
"rewards/cosine_scaled_reward": 0.00047670304775238037,
"rewards/format_reward": 0.4583333395421505,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 2755.7084045410156,
"epoch": 0.005714285714285714,
"grad_norm": 0.02737303078174591,
"kl": 0.0010824203491210938,
"learning_rate": 2e-07,
"loss": 0.077,
"reward": 0.8539811000227928,
"reward_std": 0.9332205802202225,
"rewards/cosine_scaled_reward": 0.15615718998014927,
"rewards/format_reward": 0.5416666716337204,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2715.375030517578,
"epoch": 0.006285714285714286,
"grad_norm": 0.0531262531876564,
"kl": 0.001949310302734375,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0804,
"reward": 0.17209979891777039,
"reward_std": 0.9906110465526581,
"rewards/cosine_scaled_reward": -0.1431167609989643,
"rewards/format_reward": 0.4583333395421505,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2672.8334045410156,
"epoch": 0.006857142857142857,
"grad_norm": 0.031137602403759956,
"kl": 0.0013990402221679688,
"learning_rate": 2.4e-07,
"loss": -0.0686,
"reward": 0.5019373241811991,
"reward_std": 0.5215075016021729,
"rewards/cosine_scaled_reward": -0.019864672794938087,
"rewards/format_reward": 0.5416666679084301,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2618.666717529297,
"epoch": 0.0074285714285714285,
"grad_norm": 0.03611765801906586,
"kl": 0.0011806488037109375,
"learning_rate": 2.6e-07,
"loss": 0.1676,
"reward": 0.41887298226356506,
"reward_std": 0.8584047667682171,
"rewards/cosine_scaled_reward": -0.040563490241765976,
"rewards/format_reward": 0.5000000074505806,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2235.4166870117188,
"epoch": 0.008,
"grad_norm": 0.04153580963611603,
"kl": 0.00130462646484375,
"learning_rate": 2.8e-07,
"loss": 0.0347,
"reward": 0.9663210958242416,
"reward_std": 0.7360753519460559,
"rewards/cosine_scaled_reward": 0.1706605376675725,
"rewards/format_reward": 0.6250000149011612,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2534.7083435058594,
"epoch": 0.008571428571428572,
"grad_norm": 0.03185239061713219,
"kl": 0.001331329345703125,
"learning_rate": 3e-07,
"loss": -0.0571,
"reward": 0.09811164997518063,
"reward_std": 0.2653252240270376,
"rewards/cosine_scaled_reward": -0.24261085875332355,
"rewards/format_reward": 0.5833333358168602,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 3427.0000610351562,
"epoch": 0.009142857142857144,
"grad_norm": 0.025425124913454056,
"kl": 0.0014238357543945312,
"learning_rate": 3.2e-07,
"loss": 0.0393,
"reward": 0.11474913358688354,
"reward_std": 0.7180598303675652,
"rewards/cosine_scaled_reward": -0.08845876529812813,
"rewards/format_reward": 0.2916666716337204,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 3092.2083740234375,
"epoch": 0.009714285714285713,
"grad_norm": 0.06465068459510803,
"kl": 0.001911163330078125,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.2147,
"reward": -0.37063759565353394,
"reward_std": 0.35601309686899185,
"rewards/cosine_scaled_reward": -0.26865213364362717,
"rewards/format_reward": 0.1666666716337204,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3264.9166870117188,
"epoch": 0.010285714285714285,
"grad_norm": 0.02422996051609516,
"kl": 0.0014934539794921875,
"learning_rate": 3.6e-07,
"loss": 0.0111,
"reward": -0.10622220486402512,
"reward_std": 0.8182276710867882,
"rewards/cosine_scaled_reward": -0.17811110243201256,
"rewards/format_reward": 0.2500000111758709,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 2721.916717529297,
"epoch": 0.010857142857142857,
"grad_norm": 0.033733464777469635,
"kl": 0.0017070770263671875,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.012,
"reward": 0.0332348570227623,
"reward_std": 0.4489757791161537,
"rewards/cosine_scaled_reward": -0.17088256403803825,
"rewards/format_reward": 0.375,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1720.9583435058594,
"epoch": 0.011428571428571429,
"grad_norm": 0.050742171704769135,
"kl": 0.0010938644409179688,
"learning_rate": 4e-07,
"loss": 0.0322,
"reward": 0.5358797460794449,
"reward_std": 0.697529599070549,
"rewards/cosine_scaled_reward": -0.08622681722044945,
"rewards/format_reward": 0.7083333432674408,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1904.9167175292969,
"epoch": 0.012,
"grad_norm": 0.05997886881232262,
"kl": 0.0020885467529296875,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.1317,
"reward": 0.34097065031528473,
"reward_std": 0.7731717061251402,
"rewards/cosine_scaled_reward": -0.14201467484235764,
"rewards/format_reward": 0.6250000037252903,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 3396.2500610351562,
"epoch": 0.012571428571428572,
"grad_norm": 0.026943808421492577,
"kl": 0.001140594482421875,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0515,
"reward": 0.5971196368336678,
"reward_std": 1.561735987663269,
"rewards/cosine_scaled_reward": 0.11105982027947903,
"rewards/format_reward": 0.3750000037252903,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 2507.4166870117188,
"epoch": 0.013142857142857144,
"grad_norm": 0.034301355481147766,
"kl": 0.0014629364013671875,
"learning_rate": 4.6e-07,
"loss": -0.0334,
"reward": -0.025607489049434662,
"reward_std": 0.31506185978651047,
"rewards/cosine_scaled_reward": -0.2836370915174484,
"rewards/format_reward": 0.5416666679084301,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2955.6250610351562,
"epoch": 0.013714285714285714,
"grad_norm": 0.027797186747193336,
"kl": 0.0014705657958984375,
"learning_rate": 4.8e-07,
"loss": -0.0115,
"reward": 0.3966440111398697,
"reward_std": 0.6872580870985985,
"rewards/cosine_scaled_reward": -0.05167800933122635,
"rewards/format_reward": 0.5000000111758709,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1711.7083435058594,
"epoch": 0.014285714285714285,
"grad_norm": 0.06799926608800888,
"kl": 0.001068115234375,
"learning_rate": 5e-07,
"loss": -0.0482,
"reward": 0.8884695172309875,
"reward_std": 0.5825694352388382,
"rewards/cosine_scaled_reward": 0.06923475116491318,
"rewards/format_reward": 0.75,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 2803.5000610351562,
"epoch": 0.014857142857142857,
"grad_norm": 0.03911421447992325,
"kl": 0.0014438629150390625,
"learning_rate": 5.2e-07,
"loss": 0.1519,
"reward": -0.0517643466591835,
"reward_std": 0.6173615604639053,
"rewards/cosine_scaled_reward": -0.23421551939100027,
"rewards/format_reward": 0.4166666679084301,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3351.0,
"epoch": 0.015428571428571429,
"grad_norm": 0.026781782507896423,
"kl": 0.0013971328735351562,
"learning_rate": 5.4e-07,
"loss": 0.0695,
"reward": -0.03075601067394018,
"reward_std": 0.5942016709595919,
"rewards/cosine_scaled_reward": -0.11954466812312603,
"rewards/format_reward": 0.2083333358168602,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2848.125030517578,
"epoch": 0.016,
"grad_norm": 0.04229135066270828,
"kl": 0.0016155242919921875,
"learning_rate": 5.6e-07,
"loss": 0.2301,
"reward": -0.033370375633239746,
"reward_std": 0.5533445682376623,
"rewards/cosine_scaled_reward": -0.20418519992381334,
"rewards/format_reward": 0.3750000149011612,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2896.791717529297,
"epoch": 0.01657142857142857,
"grad_norm": 0.03892264515161514,
"kl": 0.00153350830078125,
"learning_rate": 5.8e-07,
"loss": 0.0527,
"reward": 0.27996748499572277,
"reward_std": 0.7234907858073711,
"rewards/cosine_scaled_reward": -0.005849597975611687,
"rewards/format_reward": 0.2916666679084301,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 3474.4166870117188,
"epoch": 0.017142857142857144,
"grad_norm": 0.030040526762604713,
"kl": 0.0013484954833984375,
"learning_rate": 6e-07,
"loss": 0.0609,
"reward": -0.5745779648423195,
"reward_std": 0.2702234713360667,
"rewards/cosine_scaled_reward": -0.30812231451272964,
"rewards/format_reward": 0.0416666679084301,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2529.4583587646484,
"epoch": 0.017714285714285714,
"grad_norm": 0.07664818316698074,
"kl": 0.0016422271728515625,
"learning_rate": 6.2e-07,
"loss": 0.2026,
"reward": 0.09801799664273858,
"reward_std": 0.5707120858132839,
"rewards/cosine_scaled_reward": -0.11765768378973007,
"rewards/format_reward": 0.3333333432674408,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2432.4583435058594,
"epoch": 0.018285714285714287,
"grad_norm": 0.04763997718691826,
"kl": 0.0012264251708984375,
"learning_rate": 6.4e-07,
"loss": 0.0505,
"reward": 0.5702953189611435,
"reward_std": 0.7498441711068153,
"rewards/cosine_scaled_reward": 0.055980995297431946,
"rewards/format_reward": 0.4583333432674408,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.018857142857142857,
"grad_norm": 0.023195698857307434,
"kl": 0.00159454345703125,
"learning_rate": 6.6e-07,
"loss": 0.0001,
"reward": -0.4679257199168205,
"reward_std": 0.1712424661964178,
"rewards/cosine_scaled_reward": -0.23396285995841026,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2902.416717529297,
"epoch": 0.019428571428571427,
"grad_norm": 0.0315682515501976,
"kl": 0.00136566162109375,
"learning_rate": 6.800000000000001e-07,
"loss": -0.0611,
"reward": 0.13882827758789062,
"reward_std": 0.6502545811235905,
"rewards/cosine_scaled_reward": -0.13891921937465668,
"rewards/format_reward": 0.4166666716337204,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 2983.041717529297,
"epoch": 0.02,
"grad_norm": 0.03658095747232437,
"kl": 0.0015888214111328125,
"learning_rate": 7e-07,
"loss": 0.1193,
"reward": -0.31272366642951965,
"reward_std": 0.4835015330463648,
"rewards/cosine_scaled_reward": -0.3021951597183943,
"rewards/format_reward": 0.291666679084301,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2957.6251220703125,
"epoch": 0.02057142857142857,
"grad_norm": 0.05824202671647072,
"kl": 0.0018100738525390625,
"learning_rate": 7.2e-07,
"loss": 0.1881,
"reward": 0.15943543054163456,
"reward_std": 0.5593209117650986,
"rewards/cosine_scaled_reward": -0.08694895729422569,
"rewards/format_reward": 0.3333333358168602,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3325.2500610351562,
"epoch": 0.021142857142857144,
"grad_norm": 0.028705811128020287,
"kl": 0.0011510848999023438,
"learning_rate": 7.4e-07,
"loss": 0.0815,
"reward": 0.40311866998672485,
"reward_std": 1.144520342350006,
"rewards/cosine_scaled_reward": 0.07655936572700739,
"rewards/format_reward": 0.2500000074505806,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3093.8333740234375,
"epoch": 0.021714285714285714,
"grad_norm": 0.02816646546125412,
"kl": 0.001506805419921875,
"learning_rate": 7.599999999999999e-07,
"loss": -0.0294,
"reward": 0.00928196799941361,
"reward_std": 0.8401262536644936,
"rewards/cosine_scaled_reward": -0.1620256956666708,
"rewards/format_reward": 0.3333333358168602,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1561.625015258789,
"epoch": 0.022285714285714287,
"grad_norm": 0.05273086577653885,
"kl": 0.0020198822021484375,
"learning_rate": 7.799999999999999e-07,
"loss": 0.3263,
"reward": 0.7025961354374886,
"reward_std": 0.578144907951355,
"rewards/cosine_scaled_reward": -0.04453528253361583,
"rewards/format_reward": 0.791666679084301,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 3062.916717529297,
"epoch": 0.022857142857142857,
"grad_norm": 0.035854943096637726,
"kl": 0.0014286041259765625,
"learning_rate": 8e-07,
"loss": 0.1176,
"reward": -0.38635121658444405,
"reward_std": 0.42415110766887665,
"rewards/cosine_scaled_reward": -0.2973422706127167,
"rewards/format_reward": 0.2083333358168602,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2880.2916717529297,
"epoch": 0.023428571428571427,
"grad_norm": 0.03968192636966705,
"kl": 0.0013332366943359375,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0179,
"reward": -0.15289244055747986,
"reward_std": 0.29260800313204527,
"rewards/cosine_scaled_reward": -0.20144624263048172,
"rewards/format_reward": 0.25,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2456.666748046875,
"epoch": 0.024,
"grad_norm": 0.04558291286230087,
"kl": 0.0011501312255859375,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0702,
"reward": 0.1457701325416565,
"reward_std": 0.6057947687804699,
"rewards/cosine_scaled_reward": -0.15628159046173096,
"rewards/format_reward": 0.4583333395421505,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 3518.5,
"epoch": 0.02457142857142857,
"grad_norm": 0.02695702761411667,
"kl": 0.0015354156494140625,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0386,
"reward": -0.4721556343138218,
"reward_std": 0.2661081962287426,
"rewards/cosine_scaled_reward": -0.2569111529737711,
"rewards/format_reward": 0.0416666679084301,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 3359.0833740234375,
"epoch": 0.025142857142857144,
"grad_norm": 0.03516784682869911,
"kl": 0.0012874603271484375,
"learning_rate": 8.799999999999999e-07,
"loss": 0.1016,
"reward": -0.2815104126930237,
"reward_std": 0.6340971514582634,
"rewards/cosine_scaled_reward": -0.203255208209157,
"rewards/format_reward": 0.1250000037252903,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3275.0000610351562,
"epoch": 0.025714285714285714,
"grad_norm": 0.030859854072332382,
"kl": 0.0014677047729492188,
"learning_rate": 9e-07,
"loss": 0.1469,
"reward": -0.4607398062944412,
"reward_std": 0.46918192505836487,
"rewards/cosine_scaled_reward": -0.3345365710556507,
"rewards/format_reward": 0.2083333358168602,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 3325.0001220703125,
"epoch": 0.026285714285714287,
"grad_norm": 0.029891135171055794,
"kl": 0.001445770263671875,
"learning_rate": 9.2e-07,
"loss": 0.0885,
"reward": 0.2845914401113987,
"reward_std": 1.531025379896164,
"rewards/cosine_scaled_reward": -0.024370940402150154,
"rewards/format_reward": 0.3333333395421505,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2723.0416717529297,
"epoch": 0.026857142857142857,
"grad_norm": 0.03254034370183945,
"kl": 0.0010290145874023438,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0325,
"reward": 0.6241510137915611,
"reward_std": 0.6462167936842889,
"rewards/cosine_scaled_reward": 0.10374213987961411,
"rewards/format_reward": 0.4166666716337204,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 2260.2500610351562,
"epoch": 0.027428571428571427,
"grad_norm": 0.03862646967172623,
"kl": 0.0012197494506835938,
"learning_rate": 9.6e-07,
"loss": 0.1572,
"reward": 0.872937873005867,
"reward_std": 0.9012494832277298,
"rewards/cosine_scaled_reward": 0.10313559230417013,
"rewards/format_reward": 0.6666666716337204,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2530.4583435058594,
"epoch": 0.028,
"grad_norm": 0.04415576159954071,
"kl": 0.0015926361083984375,
"learning_rate": 9.8e-07,
"loss": 0.0785,
"reward": 0.21386761963367462,
"reward_std": 0.4332967512309551,
"rewards/cosine_scaled_reward": -0.1222328469157219,
"rewards/format_reward": 0.4583333432674408,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 3114.2916870117188,
"epoch": 0.02857142857142857,
"grad_norm": 0.029501451179385185,
"kl": 0.0013103485107421875,
"learning_rate": 1e-06,
"loss": 0.0703,
"reward": 0.1049373559653759,
"reward_std": 0.7344531863927841,
"rewards/cosine_scaled_reward": -0.1141980029642582,
"rewards/format_reward": 0.3333333432674408,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2840.1666717529297,
"epoch": 0.029142857142857144,
"grad_norm": 0.034089185297489166,
"kl": 0.0012674331665039062,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0221,
"reward": 0.22294384241104126,
"reward_std": 0.19924288801848888,
"rewards/cosine_scaled_reward": -0.013528086245059967,
"rewards/format_reward": 0.25,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3554.7916870117188,
"epoch": 0.029714285714285714,
"grad_norm": 0.023885369300842285,
"kl": 0.00128173828125,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0071,
"reward": -0.5025612600147724,
"reward_std": 0.5909206122159958,
"rewards/cosine_scaled_reward": -0.3137806262820959,
"rewards/format_reward": 0.1250000037252903,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 3291.7916870117188,
"epoch": 0.030285714285714287,
"grad_norm": 0.02955654263496399,
"kl": 0.0012569427490234375,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0715,
"reward": 0.20391665399074554,
"reward_std": 0.6738939415663481,
"rewards/cosine_scaled_reward": -0.0022083488292992115,
"rewards/format_reward": 0.2083333395421505,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3290.125,
"epoch": 0.030857142857142857,
"grad_norm": 0.029282676056027412,
"kl": 0.0015697479248046875,
"learning_rate": 9.998245517681593e-07,
"loss": -0.0018,
"reward": -0.35863614082336426,
"reward_std": 0.27485933527350426,
"rewards/cosine_scaled_reward": -0.2834847420454025,
"rewards/format_reward": 0.2083333432674408,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 2973.8333435058594,
"epoch": 0.03142857142857143,
"grad_norm": 0.03359178826212883,
"kl": 0.0014543533325195312,
"learning_rate": 9.997258721585931e-07,
"loss": -0.0819,
"reward": -0.011329814791679382,
"reward_std": 0.7738963216543198,
"rewards/cosine_scaled_reward": -0.17233159765601158,
"rewards/format_reward": 0.3333333358168602,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 3300.0833740234375,
"epoch": 0.032,
"grad_norm": 0.03317515552043915,
"kl": 0.001972198486328125,
"learning_rate": 9.996052735444862e-07,
"loss": 0.043,
"reward": -0.09182104840874672,
"reward_std": 0.7133485153317451,
"rewards/cosine_scaled_reward": -0.21257717907428741,
"rewards/format_reward": 0.3333333395421505,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2609.7083435058594,
"epoch": 0.03257142857142857,
"grad_norm": 0.060632482171058655,
"kl": 0.0012912750244140625,
"learning_rate": 9.994627618036452e-07,
"loss": 0.1621,
"reward": -0.09009268879890442,
"reward_std": 0.6251031160354614,
"rewards/cosine_scaled_reward": -0.25337968301028013,
"rewards/format_reward": 0.416666679084301,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3296.166748046875,
"epoch": 0.03314285714285714,
"grad_norm": 0.0355277955532074,
"kl": 0.0013217926025390625,
"learning_rate": 9.992983438818915e-07,
"loss": 0.1188,
"reward": -0.2258252203464508,
"reward_std": 0.6621097773313522,
"rewards/cosine_scaled_reward": -0.2170792818069458,
"rewards/format_reward": 0.2083333358168602,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 3051.9166870117188,
"epoch": 0.03371428571428572,
"grad_norm": 0.05533614382147789,
"kl": 0.0016040802001953125,
"learning_rate": 9.991120277927223e-07,
"loss": 0.1478,
"reward": -0.11343428725376725,
"reward_std": 0.8285746909677982,
"rewards/cosine_scaled_reward": -0.18171714432537556,
"rewards/format_reward": 0.2500000037252903,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3533.5833740234375,
"epoch": 0.03428571428571429,
"grad_norm": 0.022478297352790833,
"kl": 0.0011472702026367188,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0148,
"reward": -0.06657149642705917,
"reward_std": 0.702279582619667,
"rewards/cosine_scaled_reward": -0.11661908403038979,
"rewards/format_reward": 0.1666666679084301,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 2781.2916717529297,
"epoch": 0.03485714285714286,
"grad_norm": 0.03732416406273842,
"kl": 0.0013170242309570312,
"learning_rate": 9.98673738502114e-07,
"loss": -0.0083,
"reward": 0.24037758260965347,
"reward_std": 0.6484577842056751,
"rewards/cosine_scaled_reward": -0.025644555687904358,
"rewards/format_reward": 0.2916666679084301,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2997.5416870117188,
"epoch": 0.03542857142857143,
"grad_norm": 0.04709284380078316,
"kl": 0.0023193359375,
"learning_rate": 9.98421786662277e-07,
"loss": 0.196,
"reward": -0.4253711402416229,
"reward_std": 0.31404080241918564,
"rewards/cosine_scaled_reward": -0.33768558874726295,
"rewards/format_reward": 0.2500000111758709,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3549.2083740234375,
"epoch": 0.036,
"grad_norm": 0.03583931922912598,
"kl": 0.001361846923828125,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0204,
"reward": -0.09316671267151833,
"reward_std": 0.7212427258491516,
"rewards/cosine_scaled_reward": -0.088250030297786,
"rewards/format_reward": 0.0833333358168602,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.036571428571428574,
"grad_norm": 0.027038326486945152,
"kl": 0.0012288093566894531,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0,
"reward": -0.45635369420051575,
"reward_std": 0.48907893151044846,
"rewards/cosine_scaled_reward": -0.24901018291711807,
"rewards/format_reward": 0.0416666679084301,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3516.0833740234375,
"epoch": 0.037142857142857144,
"grad_norm": 0.026061108335852623,
"kl": 0.0015888214111328125,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0368,
"reward": -0.3072509989142418,
"reward_std": 0.5022665746510029,
"rewards/cosine_scaled_reward": -0.2161255106329918,
"rewards/format_reward": 0.1250000037252903,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 3065.3750610351562,
"epoch": 0.037714285714285714,
"grad_norm": 0.03897922486066818,
"kl": 0.0011157989501953125,
"learning_rate": 9.971955636222684e-07,
"loss": 0.1042,
"reward": 0.6038293391466141,
"reward_std": 1.0056462287902832,
"rewards/cosine_scaled_reward": 0.09358132258057594,
"rewards/format_reward": 0.416666679084301,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 2558.0416717529297,
"epoch": 0.038285714285714284,
"grad_norm": 0.0602027028799057,
"kl": 0.0016117095947265625,
"learning_rate": 9.968344786479415e-07,
"loss": 0.1471,
"reward": -0.20528408139944077,
"reward_std": 0.48294083029031754,
"rewards/cosine_scaled_reward": -0.2693087123334408,
"rewards/format_reward": 0.3333333358168602,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.038857142857142854,
"grad_norm": 0.022868435829877853,
"kl": 0.0010528564453125,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0,
"reward": -0.4899524673819542,
"reward_std": 0.18665786273777485,
"rewards/cosine_scaled_reward": -0.2449762411415577,
"rewards/format_reward": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 2950.1250610351562,
"epoch": 0.03942857142857143,
"grad_norm": 0.02722679264843464,
"kl": 0.0013837814331054688,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0011,
"reward": 0.08557448768988252,
"reward_std": 0.5314780101180077,
"rewards/cosine_scaled_reward": -0.16554608941078186,
"rewards/format_reward": 0.4166666716337204,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2750.3333740234375,
"epoch": 0.04,
"grad_norm": 0.03665938600897789,
"kl": 0.0014772415161132812,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0262,
"reward": 0.0849397610872984,
"reward_std": 0.4952346533536911,
"rewards/cosine_scaled_reward": -0.12419677525758743,
"rewards/format_reward": 0.3333333358168602,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2428.0834045410156,
"epoch": 0.04057142857142857,
"grad_norm": 0.04519721865653992,
"kl": 0.0012788772583007812,
"learning_rate": 9.951725498333448e-07,
"loss": 0.121,
"reward": 0.773456797003746,
"reward_std": 0.9204451106488705,
"rewards/cosine_scaled_reward": 0.11589503288269043,
"rewards/format_reward": 0.541666679084301,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 2911.0833740234375,
"epoch": 0.04114285714285714,
"grad_norm": 0.028104083612561226,
"kl": 0.0014371871948242188,
"learning_rate": 9.947027716509488e-07,
"loss": -0.0198,
"reward": 0.25113654136657715,
"reward_std": 0.21677139215171337,
"rewards/cosine_scaled_reward": -0.020265087485313416,
"rewards/format_reward": 0.2916666679084301,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2810.8750610351562,
"epoch": 0.04171428571428572,
"grad_norm": 0.08543611317873001,
"kl": 0.0014705657958984375,
"learning_rate": 9.942113192828444e-07,
"loss": 0.1751,
"reward": 0.2932465523481369,
"reward_std": 0.7045526169240475,
"rewards/cosine_scaled_reward": -0.14504339545965195,
"rewards/format_reward": 0.5833333432674408,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2700.125030517578,
"epoch": 0.04228571428571429,
"grad_norm": 0.03808082640171051,
"kl": 0.0013132095336914062,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0276,
"reward": 0.33640212565660477,
"reward_std": 0.7232047636061907,
"rewards/cosine_scaled_reward": -0.040132271125912666,
"rewards/format_reward": 0.4166666716337204,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 2641.0000610351562,
"epoch": 0.04285714285714286,
"grad_norm": 0.0486847385764122,
"kl": 0.001522064208984375,
"learning_rate": 9.931634888554935e-07,
"loss": 0.1158,
"reward": 0.4397294968366623,
"reward_std": 0.978052169084549,
"rewards/cosine_scaled_reward": -0.07180194836109877,
"rewards/format_reward": 0.5833333432674408,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 3285.25,
"epoch": 0.04342857142857143,
"grad_norm": 0.025055555626749992,
"kl": 0.0014476776123046875,
"learning_rate": 9.926071618660237e-07,
"loss": -0.0014,
"reward": -0.34721991792321205,
"reward_std": 0.4885272663086653,
"rewards/cosine_scaled_reward": -0.2777766231447458,
"rewards/format_reward": 0.2083333395421505,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 3071.375,
"epoch": 0.044,
"grad_norm": 0.028431611135601997,
"kl": 0.0015621185302734375,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0553,
"reward": 0.2637963071465492,
"reward_std": 0.6890296600759029,
"rewards/cosine_scaled_reward": -0.05560185760259628,
"rewards/format_reward": 0.3750000149011612,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 2630.666717529297,
"epoch": 0.044571428571428574,
"grad_norm": 0.059259917587041855,
"kl": 0.0013217926025390625,
"learning_rate": 9.91429819907136e-07,
"loss": 0.2308,
"reward": 0.18861320614814758,
"reward_std": 0.9464646428823471,
"rewards/cosine_scaled_reward": -0.1140267364680767,
"rewards/format_reward": 0.4166666828095913,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2445.2084045410156,
"epoch": 0.045142857142857144,
"grad_norm": 0.05578162893652916,
"kl": 0.0013370513916015625,
"learning_rate": 9.908088623197048e-07,
"loss": 0.1857,
"reward": 0.2249200213700533,
"reward_std": 0.43778470158576965,
"rewards/cosine_scaled_reward": -0.11670666188001633,
"rewards/format_reward": 0.4583333358168602,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3495.2500610351562,
"epoch": 0.045714285714285714,
"grad_norm": 0.023275911808013916,
"kl": 0.0011081695556640625,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0162,
"reward": 0.07009781152009964,
"reward_std": 0.6233401894569397,
"rewards/cosine_scaled_reward": -0.08995110169053078,
"rewards/format_reward": 0.2500000111758709,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 3294.7500610351562,
"epoch": 0.046285714285714284,
"grad_norm": 0.028695331886410713,
"kl": 0.0014467239379882812,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0722,
"reward": 0.4441650025546551,
"reward_std": 0.673637431114912,
"rewards/cosine_scaled_reward": 0.03458251152187586,
"rewards/format_reward": 0.3750000074505806,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 3106.8333435058594,
"epoch": 0.046857142857142854,
"grad_norm": 0.027711963281035423,
"kl": 0.0012807846069335938,
"learning_rate": 9.888172094375033e-07,
"loss": -0.0678,
"reward": 0.031808093190193176,
"reward_std": 0.23199644684791565,
"rewards/cosine_scaled_reward": -0.1090959757566452,
"rewards/format_reward": 0.25,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1248.2083358764648,
"epoch": 0.04742857142857143,
"grad_norm": 0.08828004449605942,
"kl": 0.001712799072265625,
"learning_rate": 9.881105062929221e-07,
"loss": 0.2453,
"reward": 1.3891001343727112,
"reward_std": 0.4791063070297241,
"rewards/cosine_scaled_reward": 0.2570500001311302,
"rewards/format_reward": 0.8750000149011612,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2866.9166870117188,
"epoch": 0.048,
"grad_norm": 0.026806967332959175,
"kl": 0.0012722015380859375,
"learning_rate": 9.873824502603459e-07,
"loss": -0.0688,
"reward": 0.062408581376075745,
"reward_std": 0.5405701324343681,
"rewards/cosine_scaled_reward": -0.19796237349510193,
"rewards/format_reward": 0.4583333395421505,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 3018.4583435058594,
"epoch": 0.04857142857142857,
"grad_norm": 0.03342577815055847,
"kl": 0.001705169677734375,
"learning_rate": 9.866330768241983e-07,
"loss": -0.0619,
"reward": -0.14163251221179962,
"reward_std": 0.4410179667174816,
"rewards/cosine_scaled_reward": -0.1958162598311901,
"rewards/format_reward": 0.25,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 3378.0833740234375,
"epoch": 0.04914285714285714,
"grad_norm": 0.028499940410256386,
"kl": 0.0012416839599609375,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0492,
"reward": 0.010917209088802338,
"reward_std": 1.1770060509443283,
"rewards/cosine_scaled_reward": -0.14037471916526556,
"rewards/format_reward": 0.2916666716337204,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 3004.5416870117188,
"epoch": 0.04971428571428571,
"grad_norm": 0.05726942792534828,
"kl": 0.0015773773193359375,
"learning_rate": 9.850705248720068e-07,
"loss": 0.1257,
"reward": 0.2275838926434517,
"reward_std": 0.23925334960222244,
"rewards/cosine_scaled_reward": -0.011208053678274155,
"rewards/format_reward": 0.25,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 3160.541717529297,
"epoch": 0.05028571428571429,
"grad_norm": 0.028790749609470367,
"kl": 0.0017261505126953125,
"learning_rate": 9.8425742251254e-07,
"loss": 0.033,
"reward": -0.2314395122230053,
"reward_std": 0.8384756073355675,
"rewards/cosine_scaled_reward": -0.261553093791008,
"rewards/format_reward": 0.291666679084301,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2860.6250610351562,
"epoch": 0.05085714285714286,
"grad_norm": 0.06211750581860542,
"kl": 0.00139617919921875,
"learning_rate": 9.83423155058946e-07,
"loss": 0.1842,
"reward": 0.014451097697019577,
"reward_std": 1.1666484847664833,
"rewards/cosine_scaled_reward": -0.18027446931228042,
"rewards/format_reward": 0.3750000149011612,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2257.7501220703125,
"epoch": 0.05142857142857143,
"grad_norm": 0.028858494013547897,
"kl": 0.0010528564453125,
"learning_rate": 9.825677631722435e-07,
"loss": -0.0056,
"reward": 1.5131540820002556,
"reward_std": 0.8703325614333153,
"rewards/cosine_scaled_reward": 0.3399103432893753,
"rewards/format_reward": 0.8333333358168602,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 3098.2916870117188,
"epoch": 0.052,
"grad_norm": 0.06066668778657913,
"kl": 0.0015430450439453125,
"learning_rate": 9.816912885430258e-07,
"loss": 0.099,
"reward": 0.3656032606959343,
"reward_std": 0.5041490569710732,
"rewards/cosine_scaled_reward": -0.004698362201452255,
"rewards/format_reward": 0.375,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2801.875,
"epoch": 0.052571428571428575,
"grad_norm": 0.03467201441526413,
"kl": 0.0012559890747070312,
"learning_rate": 9.807937738894303e-07,
"loss": 0.016,
"reward": 0.3106018081307411,
"reward_std": 0.1894020326435566,
"rewards/cosine_scaled_reward": 0.03030090034008026,
"rewards/format_reward": 0.25,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2385.8333587646484,
"epoch": 0.053142857142857144,
"grad_norm": 0.03670130670070648,
"kl": 0.001338958740234375,
"learning_rate": 9.798752629550546e-07,
"loss": -0.0442,
"reward": 0.30549725890159607,
"reward_std": 0.6037918105721474,
"rewards/cosine_scaled_reward": -0.09725138684734702,
"rewards/format_reward": 0.5,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2975.1250610351562,
"epoch": 0.053714285714285714,
"grad_norm": 0.05462236329913139,
"kl": 0.0011692047119140625,
"learning_rate": 9.78935800506826e-07,
"loss": 0.1892,
"reward": 0.3027478810399771,
"reward_std": 0.8827195726335049,
"rewards/cosine_scaled_reward": -0.11945939064025879,
"rewards/format_reward": 0.5416666865348816,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2339.7083435058594,
"epoch": 0.054285714285714284,
"grad_norm": 0.06422823667526245,
"kl": 0.0014171600341796875,
"learning_rate": 9.779754323328192e-07,
"loss": 0.2833,
"reward": 0.4383026659488678,
"reward_std": 1.0743879303336143,
"rewards/cosine_scaled_reward": -0.07251531630754471,
"rewards/format_reward": 0.5833333544433117,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3315.8333740234375,
"epoch": 0.054857142857142854,
"grad_norm": 0.030405810102820396,
"kl": 0.0014019012451171875,
"learning_rate": 9.769942052400235e-07,
"loss": 0.1029,
"reward": -0.1838020607829094,
"reward_std": 0.779555544257164,
"rewards/cosine_scaled_reward": -0.1752343699336052,
"rewards/format_reward": 0.1666666679084301,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2464.125030517578,
"epoch": 0.05542857142857143,
"grad_norm": 0.12049802392721176,
"kl": 0.0021877288818359375,
"learning_rate": 9.759921670520634e-07,
"loss": 0.4307,
"reward": -0.31846501119434834,
"reward_std": 0.37730174139142036,
"rewards/cosine_scaled_reward": -0.3467325195670128,
"rewards/format_reward": 0.3750000149011612,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1858.3333587646484,
"epoch": 0.056,
"grad_norm": 0.049949761480093,
"kl": 0.0011882781982421875,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0603,
"reward": 1.1145238876342773,
"reward_std": 1.2516061216592789,
"rewards/cosine_scaled_reward": 0.22392860241234303,
"rewards/format_reward": 0.666666679084301,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2434.2083740234375,
"epoch": 0.05657142857142857,
"grad_norm": 0.053174033761024475,
"kl": 0.001430511474609375,
"learning_rate": 9.739258537542835e-07,
"loss": 0.1068,
"reward": 0.8620769158005714,
"reward_std": 0.5291388845071197,
"rewards/cosine_scaled_reward": 0.20187175273895264,
"rewards/format_reward": 0.4583333395421505,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1838.0417098999023,
"epoch": 0.05714285714285714,
"grad_norm": 0.04943283274769783,
"kl": 0.0014743804931640625,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0195,
"reward": 0.9763878043740988,
"reward_std": 1.0005781203508377,
"rewards/cosine_scaled_reward": 0.13402722217142582,
"rewards/format_reward": 0.7083333395421505,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 3057.041748046875,
"epoch": 0.05771428571428571,
"grad_norm": 0.03174147754907608,
"kl": 0.0012950897216796875,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0462,
"reward": -0.19494443386793137,
"reward_std": 0.7306992970407009,
"rewards/cosine_scaled_reward": -0.2641388811171055,
"rewards/format_reward": 0.3333333358168602,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 3456.5833740234375,
"epoch": 0.05828571428571429,
"grad_norm": 0.045808080583810806,
"kl": 0.001529693603515625,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0444,
"reward": -0.422076272778213,
"reward_std": 0.645493321120739,
"rewards/cosine_scaled_reward": -0.29437146335840225,
"rewards/format_reward": 0.1666666716337204,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 2761.7500610351562,
"epoch": 0.05885714285714286,
"grad_norm": 0.041905004531145096,
"kl": 0.001399993896484375,
"learning_rate": 9.695457105469804e-07,
"loss": 0.2441,
"reward": -0.21907871961593628,
"reward_std": 0.6393478140234947,
"rewards/cosine_scaled_reward": -0.27620603516697884,
"rewards/format_reward": 0.3333333469927311,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 3128.7083435058594,
"epoch": 0.05942857142857143,
"grad_norm": 0.0318993479013443,
"kl": 0.0018177032470703125,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0306,
"reward": -0.16584369540214539,
"reward_std": 0.49407806620001793,
"rewards/cosine_scaled_reward": -0.1870885267853737,
"rewards/format_reward": 0.2083333432674408,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2826.5833740234375,
"epoch": 0.06,
"grad_norm": 0.045003149658441544,
"kl": 0.001453399658203125,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0077,
"reward": -0.16263014823198318,
"reward_std": 0.4682058170437813,
"rewards/cosine_scaled_reward": -0.2688150703907013,
"rewards/format_reward": 0.375,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 2974.9583435058594,
"epoch": 0.060571428571428575,
"grad_norm": 0.035350456833839417,
"kl": 0.0015954971313476562,
"learning_rate": 9.66045715125541e-07,
"loss": 0.033,
"reward": -0.07076919078826904,
"reward_std": 0.32792995125055313,
"rewards/cosine_scaled_reward": -0.16038458794355392,
"rewards/format_reward": 0.25,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2394.7916870117188,
"epoch": 0.061142857142857145,
"grad_norm": 0.045548874884843826,
"kl": 0.0016841888427734375,
"learning_rate": 9.648384182148252e-07,
"loss": 0.1952,
"reward": -0.13444633036851883,
"reward_std": 0.30743784829974174,
"rewards/cosine_scaled_reward": -0.2755565196275711,
"rewards/format_reward": 0.4166666716337204,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2070.1666870117188,
"epoch": 0.061714285714285715,
"grad_norm": 0.037449728697538376,
"kl": 0.0013103485107421875,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0441,
"reward": 0.29970118403434753,
"reward_std": 0.39979302138090134,
"rewards/cosine_scaled_reward": -0.12098274752497673,
"rewards/format_reward": 0.5416666679084301,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 3472.875,
"epoch": 0.062285714285714285,
"grad_norm": 0.03282066062092781,
"kl": 0.0014896392822265625,
"learning_rate": 9.623632283030077e-07,
"loss": -0.01,
"reward": -0.5779595132917166,
"reward_std": 0.1550980880856514,
"rewards/cosine_scaled_reward": -0.33064643293619156,
"rewards/format_reward": 0.0833333358168602,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 3204.4166870117188,
"epoch": 0.06285714285714286,
"grad_norm": 0.03465156629681587,
"kl": 0.0013942718505859375,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0759,
"reward": -0.19752079993486404,
"reward_std": 0.5277009457349777,
"rewards/cosine_scaled_reward": -0.22376039996743202,
"rewards/format_reward": 0.2500000111758709,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2833.8333435058594,
"epoch": 0.06342857142857143,
"grad_norm": 0.038389332592487335,
"kl": 0.0014247894287109375,
"learning_rate": 9.598076473627796e-07,
"loss": -0.0197,
"reward": 0.04367838054895401,
"reward_std": 0.40750692412257195,
"rewards/cosine_scaled_reward": -0.1031608060002327,
"rewards/format_reward": 0.25,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 3353.0416870117188,
"epoch": 0.064,
"grad_norm": 0.03541194647550583,
"kl": 0.001300811767578125,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0513,
"reward": 0.07440860942006111,
"reward_std": 0.49543108604848385,
"rewards/cosine_scaled_reward": -0.046129053458571434,
"rewards/format_reward": 0.1666666716337204,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 3231.625,
"epoch": 0.06457142857142857,
"grad_norm": 0.028411859646439552,
"kl": 0.0015411376953125,
"learning_rate": 9.571721736097088e-07,
"loss": 0.066,
"reward": -0.06201314926147461,
"reward_std": 0.7078926898539066,
"rewards/cosine_scaled_reward": -0.1143399253487587,
"rewards/format_reward": 0.1666666679084301,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 3501.2083740234375,
"epoch": 0.06514285714285714,
"grad_norm": 0.024705398827791214,
"kl": 0.0011453628540039062,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0279,
"reward": -0.3616723045706749,
"reward_std": 0.6481901705265045,
"rewards/cosine_scaled_reward": -0.24333615973591805,
"rewards/format_reward": 0.1250000037252903,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2867.500030517578,
"epoch": 0.06571428571428571,
"grad_norm": 0.04104912653565407,
"kl": 0.00212860107421875,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0948,
"reward": 0.17307260632514954,
"reward_std": 0.8984426707029343,
"rewards/cosine_scaled_reward": -0.08013037219643593,
"rewards/format_reward": 0.3333333469927311,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 3490.5833740234375,
"epoch": 0.06628571428571428,
"grad_norm": 0.02605007216334343,
"kl": 0.0013637542724609375,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0418,
"reward": -0.7712523862719536,
"reward_std": 0.16090214252471924,
"rewards/cosine_scaled_reward": -0.4064595252275467,
"rewards/format_reward": 0.0416666679084301,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 3180.4583740234375,
"epoch": 0.06685714285714285,
"grad_norm": 0.03222418949007988,
"kl": 0.0011768341064453125,
"learning_rate": 9.516636183034564e-07,
"loss": 0.1008,
"reward": 0.5754134804010391,
"reward_std": 0.9305911436676979,
"rewards/cosine_scaled_reward": 0.1002067094668746,
"rewards/format_reward": 0.3750000037252903,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2891.2083435058594,
"epoch": 0.06742857142857143,
"grad_norm": 0.030473262071609497,
"kl": 0.0016193389892578125,
"learning_rate": 9.502373679810839e-07,
"loss": -0.0385,
"reward": -0.05577632784843445,
"reward_std": 0.5428225882351398,
"rewards/cosine_scaled_reward": -0.1737215220928192,
"rewards/format_reward": 0.2916666679084301,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2154.0,
"epoch": 0.068,
"grad_norm": 0.04909588396549225,
"kl": 0.0015277862548828125,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0493,
"reward": 0.5355237275362015,
"reward_std": 0.6489699464291334,
"rewards/cosine_scaled_reward": -0.003071460872888565,
"rewards/format_reward": 0.5416666679084301,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.06857142857142857,
"grad_norm": 0.025090977549552917,
"kl": 0.0014781951904296875,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0001,
"reward": -0.3828761875629425,
"reward_std": 0.5585887841880322,
"rewards/cosine_scaled_reward": -0.27477142587304115,
"rewards/format_reward": 0.1666666679084301,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2850.75,
"epoch": 0.06914285714285714,
"grad_norm": 0.03808946907520294,
"kl": 0.0017147064208984375,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0068,
"reward": -0.03915686905384064,
"reward_std": 0.5140850841999054,
"rewards/cosine_scaled_reward": -0.14457844197750092,
"rewards/format_reward": 0.25,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1842.291732788086,
"epoch": 0.06971428571428571,
"grad_norm": 0.038628239184617996,
"kl": 0.0010824203491210938,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0693,
"reward": 1.3891835287213326,
"reward_std": 0.6790552884340286,
"rewards/cosine_scaled_reward": 0.2779250582680106,
"rewards/format_reward": 0.833333358168602,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 3182.416748046875,
"epoch": 0.07028571428571428,
"grad_norm": 0.029834022745490074,
"kl": 0.00135040283203125,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0647,
"reward": 0.1597345843911171,
"reward_std": 0.917301956564188,
"rewards/cosine_scaled_reward": -0.06596606969833374,
"rewards/format_reward": 0.2916666716337204,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 3428.3750610351562,
"epoch": 0.07085714285714285,
"grad_norm": 0.023122038692235947,
"kl": 0.0012035369873046875,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0502,
"reward": 0.11488033598288894,
"reward_std": 0.7012328114360571,
"rewards/cosine_scaled_reward": -0.06755982898175716,
"rewards/format_reward": 0.25,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2355.4584045410156,
"epoch": 0.07142857142857142,
"grad_norm": 0.07761310786008835,
"kl": 0.0016689300537109375,
"learning_rate": 9.397114317029974e-07,
"loss": 0.1346,
"reward": 0.3906384010333568,
"reward_std": 0.7293533496558666,
"rewards/cosine_scaled_reward": -0.07551417127251625,
"rewards/format_reward": 0.541666679084301,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1500.250015258789,
"epoch": 0.072,
"grad_norm": 0.04690569266676903,
"kl": 0.0012722015380859375,
"learning_rate": 9.381311511432658e-07,
"loss": 0.1106,
"reward": 0.39056021464057267,
"reward_std": 0.5676989816129208,
"rewards/cosine_scaled_reward": -0.20055323839187622,
"rewards/format_reward": 0.7916666679084301,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2584.500045776367,
"epoch": 0.07257142857142856,
"grad_norm": 0.03360402584075928,
"kl": 0.0013713836669921875,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0802,
"reward": 1.0075182765722275,
"reward_std": 0.7243328895419836,
"rewards/cosine_scaled_reward": 0.25375913130119443,
"rewards/format_reward": 0.5000000111758709,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 3354.666748046875,
"epoch": 0.07314285714285715,
"grad_norm": 0.023864390328526497,
"kl": 0.0013265609741210938,
"learning_rate": 9.34913917072228e-07,
"loss": -0.0441,
"reward": 0.49393532425165176,
"reward_std": 0.37806709855794907,
"rewards/cosine_scaled_reward": 0.08030100539326668,
"rewards/format_reward": 0.3333333358168602,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2486.2084045410156,
"epoch": 0.07371428571428572,
"grad_norm": 0.08251643180847168,
"kl": 0.0013523101806640625,
"learning_rate": 9.332771203643714e-07,
"loss": 0.1986,
"reward": 0.4050881192088127,
"reward_std": 0.4405330717563629,
"rewards/cosine_scaled_reward": -0.04745597392320633,
"rewards/format_reward": 0.5000000111758709,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 2505.5416870117188,
"epoch": 0.07428571428571429,
"grad_norm": 0.04071417823433876,
"kl": 0.0015363693237304688,
"learning_rate": 9.316216432703916e-07,
"loss": 0.2513,
"reward": 0.2988436222076416,
"reward_std": 1.0345312021672726,
"rewards/cosine_scaled_reward": -0.07974486611783504,
"rewards/format_reward": 0.4583333432674408,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 3195.500030517578,
"epoch": 0.07485714285714286,
"grad_norm": 0.03458356857299805,
"kl": 0.0014495849609375,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0254,
"reward": -0.5463367225602269,
"reward_std": 0.30226682126522064,
"rewards/cosine_scaled_reward": -0.3981683552265167,
"rewards/format_reward": 0.2500000111758709,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 2899.0000915527344,
"epoch": 0.07542857142857143,
"grad_norm": 0.03356115147471428,
"kl": 0.0011739730834960938,
"learning_rate": 9.282549715730579e-07,
"loss": 0.012,
"reward": 0.35284891817718744,
"reward_std": 0.7392313480377197,
"rewards/cosine_scaled_reward": -0.1360755565110594,
"rewards/format_reward": 0.6250000149011612,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 2948.7916870117188,
"epoch": 0.076,
"grad_norm": 0.04770968109369278,
"kl": 0.001735687255859375,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0917,
"reward": 0.43366559594869614,
"reward_std": 0.7803434580564499,
"rewards/cosine_scaled_reward": 0.008499465882778168,
"rewards/format_reward": 0.4166666865348816,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 3490.0834350585938,
"epoch": 0.07657142857142857,
"grad_norm": 0.02353997528553009,
"kl": 0.00106048583984375,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0353,
"reward": 0.307404475286603,
"reward_std": 0.8962692134082317,
"rewards/cosine_scaled_reward": 0.04953557066619396,
"rewards/format_reward": 0.2083333395421505,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2842.4583435058594,
"epoch": 0.07714285714285714,
"grad_norm": 0.04078147932887077,
"kl": 0.0019664764404296875,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0495,
"reward": -0.4577614963054657,
"reward_std": 0.24106378480792046,
"rewards/cosine_scaled_reward": -0.35388075560331345,
"rewards/format_reward": 0.25,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 2188.2500610351562,
"epoch": 0.07771428571428571,
"grad_norm": 0.0578879676759243,
"kl": 0.003261566162109375,
"learning_rate": 9.213010742252327e-07,
"loss": -0.0414,
"reward": 0.22367773577570915,
"reward_std": 0.5595778077840805,
"rewards/cosine_scaled_reward": -0.15899447072297335,
"rewards/format_reward": 0.5416666679084301,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 2250.7083435058594,
"epoch": 0.07828571428571429,
"grad_norm": 0.029618890956044197,
"kl": 0.001224517822265625,
"learning_rate": 9.195171441101668e-07,
"loss": -0.0034,
"reward": 0.39041774719953537,
"reward_std": 0.5479032769799232,
"rewards/cosine_scaled_reward": -0.03395777940750122,
"rewards/format_reward": 0.4583333432674408,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2288.5416870117188,
"epoch": 0.07885714285714286,
"grad_norm": 0.031847670674324036,
"kl": 0.0013990402221679688,
"learning_rate": 9.177152042508077e-07,
"loss": 0.1657,
"reward": 0.23090605437755585,
"reward_std": 0.1713680997490883,
"rewards/cosine_scaled_reward": -0.13454700261354446,
"rewards/format_reward": 0.5,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 3477.4584350585938,
"epoch": 0.07942857142857143,
"grad_norm": 0.029080698266625404,
"kl": 0.0010776519775390625,
"learning_rate": 9.158953424711624e-07,
"loss": 0.048,
"reward": -0.039053503423929214,
"reward_std": 0.8403101861476898,
"rewards/cosine_scaled_reward": -0.14452676009386778,
"rewards/format_reward": 0.2500000037252903,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 2914.875030517578,
"epoch": 0.08,
"grad_norm": 0.023377321660518646,
"kl": 0.0012502670288085938,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0661,
"reward": 0.3945077173411846,
"reward_std": 0.43620033375918865,
"rewards/cosine_scaled_reward": 0.030587172135710716,
"rewards/format_reward": 0.3333333358168602,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 2429.9584045410156,
"epoch": 0.08057142857142857,
"grad_norm": 0.060110971331596375,
"kl": 0.0018157958984375,
"learning_rate": 9.122022088101613e-07,
"loss": 0.2163,
"reward": 0.49823567643761635,
"reward_std": 0.9982109814882278,
"rewards/cosine_scaled_reward": -0.08421549946069717,
"rewards/format_reward": 0.6666666828095913,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 2689.5833587646484,
"epoch": 0.08114285714285714,
"grad_norm": 0.06225240230560303,
"kl": 0.0021820068359375,
"learning_rate": 9.103291169269299e-07,
"loss": -0.02,
"reward": 0.4396224841475487,
"reward_std": 0.9268415421247482,
"rewards/cosine_scaled_reward": -0.009355428628623486,
"rewards/format_reward": 0.4583333507180214,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2746.9166717529297,
"epoch": 0.08171428571428571,
"grad_norm": 0.04002835229039192,
"kl": 0.0014972686767578125,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0273,
"reward": 0.42774503864347935,
"reward_std": 0.5674133813008666,
"rewards/cosine_scaled_reward": 0.04720588028430939,
"rewards/format_reward": 0.3333333358168602,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 2831.875030517578,
"epoch": 0.08228571428571428,
"grad_norm": 0.039241962134838104,
"kl": 0.0017242431640625,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0095,
"reward": 0.6469228328205645,
"reward_std": 0.9649386182427406,
"rewards/cosine_scaled_reward": 0.03179474361240864,
"rewards/format_reward": 0.5833333469927311,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1920.7917022705078,
"epoch": 0.08285714285714285,
"grad_norm": 0.05307425558567047,
"kl": 0.003032684326171875,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0972,
"reward": 0.42246972769498825,
"reward_std": 0.7057302370667458,
"rewards/cosine_scaled_reward": -0.1012651463970542,
"rewards/format_reward": 0.6250000149011612,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2315.666717529297,
"epoch": 0.08342857142857144,
"grad_norm": 0.03780323639512062,
"kl": 0.0018100738525390625,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0869,
"reward": 0.7690896391868591,
"reward_std": 0.908334705978632,
"rewards/cosine_scaled_reward": 0.05121150240302086,
"rewards/format_reward": 0.6666666865348816,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 2429.916748046875,
"epoch": 0.084,
"grad_norm": 0.03277585655450821,
"kl": 0.0013561248779296875,
"learning_rate": 9.007020842191634e-07,
"loss": 0.1537,
"reward": 0.36759535409510136,
"reward_std": 0.6812577322125435,
"rewards/cosine_scaled_reward": -0.04536898247897625,
"rewards/format_reward": 0.4583333358168602,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 2083.750045776367,
"epoch": 0.08457142857142858,
"grad_norm": 0.0774744376540184,
"kl": 0.0018138885498046875,
"learning_rate": 8.987250199168808e-07,
"loss": -0.0813,
"reward": 0.5623667687177658,
"reward_std": 0.5345336459577084,
"rewards/cosine_scaled_reward": -0.09381664916872978,
"rewards/format_reward": 0.75,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 3054.666748046875,
"epoch": 0.08514285714285715,
"grad_norm": 0.09238254278898239,
"kl": 0.0024662017822265625,
"learning_rate": 8.967309592491052e-07,
"loss": 0.1538,
"reward": 0.13933675922453403,
"reward_std": 0.8963729050010443,
"rewards/cosine_scaled_reward": -0.1386649664491415,
"rewards/format_reward": 0.416666679084301,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 2996.5833740234375,
"epoch": 0.08571428571428572,
"grad_norm": 0.05524285510182381,
"kl": 0.002140045166015625,
"learning_rate": 8.9471999940354e-07,
"loss": 0.1897,
"reward": -0.24548465991392732,
"reward_std": 0.36908531468361616,
"rewards/cosine_scaled_reward": -0.26857565343379974,
"rewards/format_reward": 0.2916666716337204,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2975.416717529297,
"epoch": 0.08628571428571429,
"grad_norm": 0.02247774600982666,
"kl": 0.0010805130004882812,
"learning_rate": 8.926922383915315e-07,
"loss": 0.028,
"reward": 0.7967187613248825,
"reward_std": 0.7793629765510559,
"rewards/cosine_scaled_reward": 0.12752603506669402,
"rewards/format_reward": 0.541666679084301,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2992.7500610351562,
"epoch": 0.08685714285714285,
"grad_norm": 0.034905631095170975,
"kl": 0.001766204833984375,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0802,
"reward": -0.17431941628456116,
"reward_std": 0.5932451635599136,
"rewards/cosine_scaled_reward": -0.253826379776001,
"rewards/format_reward": 0.3333333469927311,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2648.375045776367,
"epoch": 0.08742857142857142,
"grad_norm": 0.061414219439029694,
"kl": 0.00140380859375,
"learning_rate": 8.88586709003076e-07,
"loss": 0.2276,
"reward": 0.2311987802386284,
"reward_std": 0.3305536136031151,
"rewards/cosine_scaled_reward": -0.0719006210565567,
"rewards/format_reward": 0.375,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 3579.7083740234375,
"epoch": 0.088,
"grad_norm": 0.027387233451008797,
"kl": 0.002086639404296875,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0025,
"reward": -0.4098093993961811,
"reward_std": 0.39899626886472106,
"rewards/cosine_scaled_reward": -0.22573803085833788,
"rewards/format_reward": 0.0416666679084301,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 3521.0,
"epoch": 0.08857142857142856,
"grad_norm": 0.022302396595478058,
"kl": 0.0013523101806640625,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0376,
"reward": -0.4349953904747963,
"reward_std": 0.45812859386205673,
"rewards/cosine_scaled_reward": -0.23833103850483894,
"rewards/format_reward": 0.0416666679084301,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1962.8333740234375,
"epoch": 0.08914285714285715,
"grad_norm": 0.06173359975218773,
"kl": 0.0016994476318359375,
"learning_rate": 8.823049032816478e-07,
"loss": 0.2643,
"reward": 0.8809916228055954,
"reward_std": 0.7970750592648983,
"rewards/cosine_scaled_reward": 0.1071624793112278,
"rewards/format_reward": 0.6666666865348816,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2750.4583435058594,
"epoch": 0.08971428571428572,
"grad_norm": 0.03586895018815994,
"kl": 0.0015087127685546875,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0192,
"reward": -0.05570299178361893,
"reward_std": 0.3191075511276722,
"rewards/cosine_scaled_reward": -0.25701816380023956,
"rewards/format_reward": 0.4583333432674408,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 3168.0834350585938,
"epoch": 0.09028571428571429,
"grad_norm": 0.0348164327442646,
"kl": 0.0015211105346679688,
"learning_rate": 8.780358823396352e-07,
"loss": 0.1443,
"reward": 0.2681655287742615,
"reward_std": 0.8248983416706324,
"rewards/cosine_scaled_reward": -0.01175057515501976,
"rewards/format_reward": 0.2916666753590107,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 3583.1666870117188,
"epoch": 0.09085714285714286,
"grad_norm": 0.022044459357857704,
"kl": 0.0014705657958984375,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0004,
"reward": 0.07879661209881306,
"reward_std": 0.6083062998950481,
"rewards/cosine_scaled_reward": -0.08560170233249664,
"rewards/format_reward": 0.2500000074505806,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 3510.5000610351562,
"epoch": 0.09142857142857143,
"grad_norm": 0.022759966552257538,
"kl": 0.0013828277587890625,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0369,
"reward": 0.02655605599284172,
"reward_std": 0.5840941201895475,
"rewards/cosine_scaled_reward": -0.09088863991200924,
"rewards/format_reward": 0.2083333358168602,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 3305.2500610351562,
"epoch": 0.092,
"grad_norm": 0.05514242500066757,
"kl": 0.0026226043701171875,
"learning_rate": 8.715127058347614e-07,
"loss": 0.1226,
"reward": -0.20715498179197311,
"reward_std": 0.7715437412261963,
"rewards/cosine_scaled_reward": -0.18691082065925002,
"rewards/format_reward": 0.1666666716337204,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2757.1666870117188,
"epoch": 0.09257142857142857,
"grad_norm": 0.027969077229499817,
"kl": 0.0012922286987304688,
"learning_rate": 8.693068314414344e-07,
"loss": 0.049,
"reward": 0.13557783141732216,
"reward_std": 0.6433919221162796,
"rewards/cosine_scaled_reward": -0.11971108987927437,
"rewards/format_reward": 0.3750000111758709,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 2974.125,
"epoch": 0.09314285714285714,
"grad_norm": 0.0466889925301075,
"kl": 0.0018482208251953125,
"learning_rate": 8.670853944836176e-07,
"loss": 0.1701,
"reward": 0.08318175934255123,
"reward_std": 0.446257796138525,
"rewards/cosine_scaled_reward": -0.06257577612996101,
"rewards/format_reward": 0.2083333432674408,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 3329.791748046875,
"epoch": 0.09371428571428571,
"grad_norm": 0.023540405556559563,
"kl": 0.0011653900146484375,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0473,
"reward": 0.013122133910655975,
"reward_std": 0.7575416192412376,
"rewards/cosine_scaled_reward": -0.1809389404952526,
"rewards/format_reward": 0.3750000111758709,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 3478.625,
"epoch": 0.09428571428571429,
"grad_norm": 0.025917023420333862,
"kl": 0.00133514404296875,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0194,
"reward": -0.3534129783511162,
"reward_std": 0.4404726307839155,
"rewards/cosine_scaled_reward": -0.23920650780200958,
"rewards/format_reward": 0.125,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 3558.375,
"epoch": 0.09485714285714286,
"grad_norm": 0.02446811832487583,
"kl": 0.001556396484375,
"learning_rate": 8.603287946810513e-07,
"loss": 0.015,
"reward": -0.6314733251929283,
"reward_std": 0.49060556665062904,
"rewards/cosine_scaled_reward": -0.33656999468803406,
"rewards/format_reward": 0.0416666679084301,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 2561.1666870117188,
"epoch": 0.09542857142857143,
"grad_norm": 0.03735367953777313,
"kl": 0.0012989044189453125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.1364,
"reward": 1.5804292261600494,
"reward_std": 1.3125610798597336,
"rewards/cosine_scaled_reward": 0.39438124373555183,
"rewards/format_reward": 0.7916666716337204,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 3076.7083740234375,
"epoch": 0.096,
"grad_norm": 0.02906375750899315,
"kl": 0.0014247894287109375,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0478,
"reward": -0.05466272961348295,
"reward_std": 0.5646889731287956,
"rewards/cosine_scaled_reward": -0.1939980387687683,
"rewards/format_reward": 0.3333333432674408,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 2723.7500610351562,
"epoch": 0.09657142857142857,
"grad_norm": 0.033980101346969604,
"kl": 0.0027484893798828125,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0763,
"reward": 0.5076356455683708,
"reward_std": 1.2118781208992004,
"rewards/cosine_scaled_reward": -0.03784886375069618,
"rewards/format_reward": 0.5833333544433117,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 3455.0000610351562,
"epoch": 0.09714285714285714,
"grad_norm": 0.02752372808754444,
"kl": 0.0014181137084960938,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0758,
"reward": -0.042420875281095505,
"reward_std": 0.4691929258406162,
"rewards/cosine_scaled_reward": -0.08371043996885419,
"rewards/format_reward": 0.1250000037252903,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 3003.0833740234375,
"epoch": 0.09771428571428571,
"grad_norm": 0.02626621350646019,
"kl": 0.001392364501953125,
"learning_rate": 8.487667956935087e-07,
"loss": -0.0276,
"reward": 0.34312840178608894,
"reward_std": 0.6185203231871128,
"rewards/cosine_scaled_reward": -0.01593582145869732,
"rewards/format_reward": 0.3750000149011612,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 3214.0000610351562,
"epoch": 0.09828571428571428,
"grad_norm": 0.03126896545290947,
"kl": 0.001842498779296875,
"learning_rate": 8.464102570534061e-07,
"loss": 0.1479,
"reward": -0.4141024649143219,
"reward_std": 0.3650788702070713,
"rewards/cosine_scaled_reward": -0.31121791154146194,
"rewards/format_reward": 0.2083333358168602,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 2655.5416870117188,
"epoch": 0.09885714285714285,
"grad_norm": 0.035498738288879395,
"kl": 0.0016155242919921875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0215,
"reward": 0.5514224171638489,
"reward_std": 0.8970091938972473,
"rewards/cosine_scaled_reward": -0.015955455601215363,
"rewards/format_reward": 0.5833333432674408,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09942857142857142,
"grad_norm": 0.051081810146570206,
"kl": 0.00257110595703125,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0001,
"reward": -0.6011745631694794,
"reward_std": 0.20032861828804016,
"rewards/cosine_scaled_reward": -0.3005872815847397,
"rewards/format_reward": 0.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 2637.2500610351562,
"epoch": 0.1,
"grad_norm": 0.04597315564751625,
"kl": 0.00162506103515625,
"learning_rate": 8.392544243589427e-07,
"loss": 0.204,
"reward": 0.9529447704553604,
"reward_std": 0.9328476339578629,
"rewards/cosine_scaled_reward": 0.18480567634105682,
"rewards/format_reward": 0.583333358168602,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 2719.4583740234375,
"epoch": 0.10057142857142858,
"grad_norm": 0.033278558403253555,
"kl": 0.0023441314697265625,
"learning_rate": 8.368407953869103e-07,
"loss": -0.0761,
"reward": 0.558716244995594,
"reward_std": 0.9363207742571831,
"rewards/cosine_scaled_reward": 0.050191452726721764,
"rewards/format_reward": 0.4583333544433117,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 3019.7916870117188,
"epoch": 0.10114285714285715,
"grad_norm": 0.02725154533982277,
"kl": 0.001819610595703125,
"learning_rate": 8.344131861991828e-07,
"loss": -0.0076,
"reward": 0.23852095007896423,
"reward_std": 0.4755494873970747,
"rewards/cosine_scaled_reward": -0.047406191006302834,
"rewards/format_reward": 0.3333333358168602,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 3509.9583740234375,
"epoch": 0.10171428571428572,
"grad_norm": 0.020582351833581924,
"kl": 0.0012874603271484375,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0368,
"reward": -0.2527635097503662,
"reward_std": 0.6957648396492004,
"rewards/cosine_scaled_reward": -0.1888817399740219,
"rewards/format_reward": 0.1250000037252903,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 3021.4583435058594,
"epoch": 0.10228571428571429,
"grad_norm": 0.034061066806316376,
"kl": 0.0016498565673828125,
"learning_rate": 8.295165011252396e-07,
"loss": 0.1351,
"reward": -0.2164098173379898,
"reward_std": 0.40133420657366514,
"rewards/cosine_scaled_reward": -0.2332049012184143,
"rewards/format_reward": 0.2500000111758709,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 3016.2916870117188,
"epoch": 0.10285714285714286,
"grad_norm": 0.03417361527681351,
"kl": 0.001708984375,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0431,
"reward": -0.30307525768876076,
"reward_std": 0.41938718408346176,
"rewards/cosine_scaled_reward": -0.297370970249176,
"rewards/format_reward": 0.291666679084301,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 3388.3333740234375,
"epoch": 0.10342857142857143,
"grad_norm": 0.03471015393733978,
"kl": 0.0012874603271484375,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0746,
"reward": 0.09950344264507294,
"reward_std": 0.9219473525881767,
"rewards/cosine_scaled_reward": -0.09608161449432373,
"rewards/format_reward": 0.2916666716337204,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 3492.3333740234375,
"epoch": 0.104,
"grad_norm": 0.028234565630555153,
"kl": 0.0017910003662109375,
"learning_rate": 8.220696016880687e-07,
"loss": 0.038,
"reward": -0.704791434109211,
"reward_std": 0.26428926922380924,
"rewards/cosine_scaled_reward": -0.3940623998641968,
"rewards/format_reward": 0.0833333358168602,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 3276.291748046875,
"epoch": 0.10457142857142857,
"grad_norm": 0.03745553269982338,
"kl": 0.001678466796875,
"learning_rate": 8.195606193320136e-07,
"loss": 0.1006,
"reward": -0.1923907333984971,
"reward_std": 0.5069778934121132,
"rewards/cosine_scaled_reward": -0.26286204531788826,
"rewards/format_reward": 0.3333333432674408,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 2740.2083740234375,
"epoch": 0.10514285714285715,
"grad_norm": 0.03239819407463074,
"kl": 0.0020580291748046875,
"learning_rate": 8.170384989716657e-07,
"loss": -0.0086,
"reward": 0.4912131130695343,
"reward_std": 0.8207836002111435,
"rewards/cosine_scaled_reward": 0.01643986999988556,
"rewards/format_reward": 0.4583333395421505,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 2281.0833435058594,
"epoch": 0.10571428571428572,
"grad_norm": 0.051785144954919815,
"kl": 0.001720428466796875,
"learning_rate": 8.145033635316128e-07,
"loss": 0.168,
"reward": -0.10565903782844543,
"reward_std": 0.4885219484567642,
"rewards/cosine_scaled_reward": -0.3028295412659645,
"rewards/format_reward": 0.5000000111758709,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2814.250045776367,
"epoch": 0.10628571428571429,
"grad_norm": 0.029900984838604927,
"kl": 0.001705169677734375,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0652,
"reward": 0.2904910668730736,
"reward_std": 0.5625524595379829,
"rewards/cosine_scaled_reward": -0.021421127021312714,
"rewards/format_reward": 0.3333333358168602,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 3413.3333740234375,
"epoch": 0.10685714285714286,
"grad_norm": 0.02681022137403488,
"kl": 0.0017871856689453125,
"learning_rate": 8.093945422764069e-07,
"loss": 0.032,
"reward": -0.25666307006031275,
"reward_std": 0.5059951674193144,
"rewards/cosine_scaled_reward": -0.21166487038135529,
"rewards/format_reward": 0.1666666716337204,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 3395.625,
"epoch": 0.10742857142857143,
"grad_norm": 0.02412141114473343,
"kl": 0.0014896392822265625,
"learning_rate": 8.068211054579943e-07,
"loss": -0.0134,
"reward": -0.1184331551194191,
"reward_std": 0.4931886652484536,
"rewards/cosine_scaled_reward": -0.16338326781988144,
"rewards/format_reward": 0.2083333432674408,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 3558.0416870117188,
"epoch": 0.108,
"grad_norm": 0.021045317873358727,
"kl": 0.0013065338134765625,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0148,
"reward": -0.4834739826619625,
"reward_std": 0.5813364610075951,
"rewards/cosine_scaled_reward": -0.3042369857430458,
"rewards/format_reward": 0.1250000037252903,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 3056.125,
"epoch": 0.10857142857142857,
"grad_norm": 0.05889802798628807,
"kl": 0.00144195556640625,
"learning_rate": 8.01636806561836e-07,
"loss": 0.1452,
"reward": -0.37310480140149593,
"reward_std": 0.3222651779651642,
"rewards/cosine_scaled_reward": -0.29071907326579094,
"rewards/format_reward": 0.2083333432674408,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 2909.1666870117188,
"epoch": 0.10914285714285714,
"grad_norm": 0.04346390441060066,
"kl": 0.001735687255859375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.1364,
"reward": -0.19369822181761265,
"reward_std": 0.34732532128691673,
"rewards/cosine_scaled_reward": -0.28434912487864494,
"rewards/format_reward": 0.3750000149011612,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2861.5,
"epoch": 0.10971428571428571,
"grad_norm": 0.02965477854013443,
"kl": 0.0020046234130859375,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0067,
"reward": 0.1161637231707573,
"reward_std": 0.3917825035750866,
"rewards/cosine_scaled_reward": -0.06691817007958889,
"rewards/format_reward": 0.25,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 3194.9583435058594,
"epoch": 0.11028571428571429,
"grad_norm": 0.046328771859407425,
"kl": 0.001220703125,
"learning_rate": 7.93768694627233e-07,
"loss": 0.1202,
"reward": -0.4120568297803402,
"reward_std": 0.30619435384869576,
"rewards/cosine_scaled_reward": -0.2893617544323206,
"rewards/format_reward": 0.1666666716337204,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 2576.2083740234375,
"epoch": 0.11085714285714286,
"grad_norm": 0.05236797407269478,
"kl": 0.002651214599609375,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0191,
"reward": 0.30105990916490555,
"reward_std": 1.1164244264364243,
"rewards/cosine_scaled_reward": -0.03697003796696663,
"rewards/format_reward": 0.3750000149011612,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 3026.5833740234375,
"epoch": 0.11142857142857143,
"grad_norm": 0.037078604102134705,
"kl": 0.0023555755615234375,
"learning_rate": 7.884636689049422e-07,
"loss": 0.1005,
"reward": -0.11753500252962112,
"reward_std": 0.8442722856998444,
"rewards/cosine_scaled_reward": -0.20460084080696106,
"rewards/format_reward": 0.2916666716337204,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2788.083335876465,
"epoch": 0.112,
"grad_norm": 0.03969040513038635,
"kl": 0.0010595321655273438,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0541,
"reward": 0.30176354944705963,
"reward_std": 0.20443878509104252,
"rewards/cosine_scaled_reward": 0.025881759822368622,
"rewards/format_reward": 0.25,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 3195.7916870117188,
"epoch": 0.11257142857142857,
"grad_norm": 0.029293222352862358,
"kl": 0.0015811920166015625,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0546,
"reward": -0.2110063135623932,
"reward_std": 0.5540565922856331,
"rewards/cosine_scaled_reward": -0.20966985076665878,
"rewards/format_reward": 0.2083333432674408,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2883.2083740234375,
"epoch": 0.11314285714285714,
"grad_norm": 0.02972813881933689,
"kl": 0.0017833709716796875,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0437,
"reward": 0.38034603744745255,
"reward_std": 0.24535933695733547,
"rewards/cosine_scaled_reward": 0.023506322875618935,
"rewards/format_reward": 0.3333333358168602,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 3448.4583740234375,
"epoch": 0.11371428571428571,
"grad_norm": 0.029532499611377716,
"kl": 0.001827239990234375,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0358,
"reward": 0.006571102887392044,
"reward_std": 0.7588822916150093,
"rewards/cosine_scaled_reward": -0.10088112286757678,
"rewards/format_reward": 0.2083333358168602,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2746.625045776367,
"epoch": 0.11428571428571428,
"grad_norm": 0.0425640270113945,
"kl": 0.00368499755859375,
"learning_rate": 7.75e-07,
"loss": -0.0118,
"reward": -0.09352404624223709,
"reward_std": 0.744966734200716,
"rewards/cosine_scaled_reward": -0.2550953645259142,
"rewards/format_reward": 0.4166666679084301,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 3081.416748046875,
"epoch": 0.11485714285714285,
"grad_norm": 0.031153298914432526,
"kl": 0.0028228759765625,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0011,
"reward": -0.2010077782906592,
"reward_std": 0.4158187806606293,
"rewards/cosine_scaled_reward": -0.22550388798117638,
"rewards/format_reward": 0.25,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2844.0833435058594,
"epoch": 0.11542857142857142,
"grad_norm": 0.03700404614210129,
"kl": 0.0016689300537109375,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0649,
"reward": 0.2126096710562706,
"reward_std": 0.8212988525629044,
"rewards/cosine_scaled_reward": -0.039528511464595795,
"rewards/format_reward": 0.2916666679084301,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 3052.4166870117188,
"epoch": 0.116,
"grad_norm": 0.05321969836950302,
"kl": 0.001529693603515625,
"learning_rate": 7.667891533457718e-07,
"loss": -0.0751,
"reward": 0.42638406017795205,
"reward_std": 0.7677848218008876,
"rewards/cosine_scaled_reward": 0.004858694272115827,
"rewards/format_reward": 0.4166666716337204,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2651.3333435058594,
"epoch": 0.11657142857142858,
"grad_norm": 0.035339463502168655,
"kl": 0.001384735107421875,
"learning_rate": 7.640308940816239e-07,
"loss": 0.1109,
"reward": -0.363785021007061,
"reward_std": 0.16049830988049507,
"rewards/cosine_scaled_reward": -0.3485591784119606,
"rewards/format_reward": 0.3333333358168602,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2819.4166870117188,
"epoch": 0.11714285714285715,
"grad_norm": 0.04727662727236748,
"kl": 0.001739501953125,
"learning_rate": 7.612622032536507e-07,
"loss": -0.0962,
"reward": 0.11423163115978241,
"reward_std": 0.7668849229812622,
"rewards/cosine_scaled_reward": -0.13038418628275394,
"rewards/format_reward": 0.375,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 3166.8333740234375,
"epoch": 0.11771428571428572,
"grad_norm": 0.034578051418066025,
"kl": 0.00209808349609375,
"learning_rate": 7.584832158039378e-07,
"loss": -0.0199,
"reward": 0.3246152736246586,
"reward_std": 0.657014474272728,
"rewards/cosine_scaled_reward": -0.046025678515434265,
"rewards/format_reward": 0.4166666716337204,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2621.8333740234375,
"epoch": 0.11828571428571429,
"grad_norm": 0.031992822885513306,
"kl": 0.0015125274658203125,
"learning_rate": 7.556940671764124e-07,
"loss": 0.1341,
"reward": -0.06652424111962318,
"reward_std": 0.44591469690203667,
"rewards/cosine_scaled_reward": -0.2207621317356825,
"rewards/format_reward": 0.375,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 2423.9166870117188,
"epoch": 0.11885714285714286,
"grad_norm": 0.03885209187865257,
"kl": 0.001659393310546875,
"learning_rate": 7.528948933102438e-07,
"loss": 0.2423,
"reward": 0.9343845894327387,
"reward_std": 0.7346701547503471,
"rewards/cosine_scaled_reward": 0.21719225496053696,
"rewards/format_reward": 0.5000000111758709,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2559.416732788086,
"epoch": 0.11942857142857143,
"grad_norm": 0.036148421466350555,
"kl": 0.001346588134765625,
"learning_rate": 7.500858306332172e-07,
"loss": 0.1224,
"reward": 0.5156484246253967,
"reward_std": 0.3985909800976515,
"rewards/cosine_scaled_reward": 0.02865753509104252,
"rewards/format_reward": 0.4583333358168602,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 3318.0000610351562,
"epoch": 0.12,
"grad_norm": 0.03955541551113129,
"kl": 0.0023975372314453125,
"learning_rate": 7.472670160550848e-07,
"loss": 0.1178,
"reward": -0.3450120287016034,
"reward_std": 0.34643174801021814,
"rewards/cosine_scaled_reward": -0.25583934411406517,
"rewards/format_reward": 0.1666666716337204,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 2330.0416870117188,
"epoch": 0.12057142857142857,
"grad_norm": 0.03605935350060463,
"kl": 0.0014362335205078125,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0757,
"reward": 0.46986766904592514,
"reward_std": 0.7498459592461586,
"rewards/cosine_scaled_reward": -0.015066176652908325,
"rewards/format_reward": 0.5000000111758709,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 3491.375,
"epoch": 0.12114285714285715,
"grad_norm": 0.03921404108405113,
"kl": 0.002040863037109375,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0193,
"reward": 0.35056743025779724,
"reward_std": 0.9043543115258217,
"rewards/cosine_scaled_reward": 0.07111704908311367,
"rewards/format_reward": 0.2083333395421505,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 2373.2083740234375,
"epoch": 0.12171428571428572,
"grad_norm": 0.039183299988508224,
"kl": 0.0014171600341796875,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0646,
"reward": 0.5132194012403488,
"reward_std": 0.7909904848784208,
"rewards/cosine_scaled_reward": -0.014223627746105194,
"rewards/format_reward": 0.5416666716337204,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2606.2916717529297,
"epoch": 0.12228571428571429,
"grad_norm": 0.05271153524518013,
"kl": 0.0015239715576171875,
"learning_rate": 7.358969934210438e-07,
"loss": 0.1925,
"reward": -0.2785708848387003,
"reward_std": 0.2418291885405779,
"rewards/cosine_scaled_reward": -0.30595211312174797,
"rewards/format_reward": 0.3333333358168602,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 3029.1666870117188,
"epoch": 0.12285714285714286,
"grad_norm": 0.0378519743680954,
"kl": 0.0014743804931640625,
"learning_rate": 7.330314893841101e-07,
"loss": 0.1417,
"reward": -0.130330890417099,
"reward_std": 0.6368861794471741,
"rewards/cosine_scaled_reward": -0.2109987661242485,
"rewards/format_reward": 0.291666679084301,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1363.8333587646484,
"epoch": 0.12342857142857143,
"grad_norm": 0.06036696955561638,
"kl": 0.0041713714599609375,
"learning_rate": 7.301570646506027e-07,
"loss": 0.2256,
"reward": 1.9316122829914093,
"reward_std": 0.9315946847200394,
"rewards/cosine_scaled_reward": 0.5074727982282639,
"rewards/format_reward": 0.9166666716337204,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 3511.0833740234375,
"epoch": 0.124,
"grad_norm": 0.023292165249586105,
"kl": 0.0011806488037109375,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0259,
"reward": -0.10942696779966354,
"reward_std": 0.5669385343790054,
"rewards/cosine_scaled_reward": -0.11721348948776722,
"rewards/format_reward": 0.1250000037252903,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2582.3750610351562,
"epoch": 0.12457142857142857,
"grad_norm": 0.036586906760931015,
"kl": 0.00202178955078125,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0355,
"reward": 0.34109579771757126,
"reward_std": 0.6015773788094521,
"rewards/cosine_scaled_reward": -0.10028542950749397,
"rewards/format_reward": 0.5416666865348816,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 2762.1666870117188,
"epoch": 0.12514285714285714,
"grad_norm": 0.07105877995491028,
"kl": 0.0031890869140625,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0505,
"reward": 0.43696425296366215,
"reward_std": 1.1640847735106945,
"rewards/cosine_scaled_reward": -0.0315178744494915,
"rewards/format_reward": 0.5000000037252903,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2666.541748046875,
"epoch": 0.12571428571428572,
"grad_norm": 0.028472868725657463,
"kl": 0.0014820098876953125,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0372,
"reward": 1.2517385482788086,
"reward_std": 0.9800175577402115,
"rewards/cosine_scaled_reward": 0.2508692592382431,
"rewards/format_reward": 0.7500000223517418,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1970.4166870117188,
"epoch": 0.12628571428571428,
"grad_norm": 0.06328482180833817,
"kl": 0.005306243896484375,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0612,
"reward": 0.6105676591396332,
"reward_std": 0.7854063790291548,
"rewards/cosine_scaled_reward": -0.0488828644156456,
"rewards/format_reward": 0.7083333432674408,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1992.2500228881836,
"epoch": 0.12685714285714286,
"grad_norm": 0.06610880047082901,
"kl": 0.0022563934326171875,
"learning_rate": 7.127310565369415e-07,
"loss": 0.1555,
"reward": 0.6662397608160973,
"reward_std": 0.8406016491353512,
"rewards/cosine_scaled_reward": 0.04145320225507021,
"rewards/format_reward": 0.5833333432674408,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2984.9583435058594,
"epoch": 0.12742857142857142,
"grad_norm": 0.03647996112704277,
"kl": 0.00249481201171875,
"learning_rate": 7.097981330836616e-07,
"loss": 0.1652,
"reward": -0.178400207310915,
"reward_std": 0.4905394911766052,
"rewards/cosine_scaled_reward": -0.21420010179281235,
"rewards/format_reward": 0.2500000111758709,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 3452.5,
"epoch": 0.128,
"grad_norm": 0.027256779372692108,
"kl": 0.0012521743774414062,
"learning_rate": 7.068574212948169e-07,
"loss": 0.026,
"reward": 0.24938065744936466,
"reward_std": 0.7312147244811058,
"rewards/cosine_scaled_reward": 0.04135699989274144,
"rewards/format_reward": 0.1666666716337204,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 3069.541717529297,
"epoch": 0.12857142857142856,
"grad_norm": 0.07334668934345245,
"kl": 0.002166748046875,
"learning_rate": 7.039090644965509e-07,
"loss": 0.1767,
"reward": -0.0735011026263237,
"reward_std": 0.7092219665646553,
"rewards/cosine_scaled_reward": -0.14091723039746284,
"rewards/format_reward": 0.2083333395421505,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 2290.875,
"epoch": 0.12914285714285714,
"grad_norm": 0.04525444656610489,
"kl": 0.0022106170654296875,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0869,
"reward": 0.668890580534935,
"reward_std": 0.3278286010026932,
"rewards/cosine_scaled_reward": 0.10527862049639225,
"rewards/format_reward": 0.4583333432674408,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2456.500045776367,
"epoch": 0.12971428571428573,
"grad_norm": 0.03140163794159889,
"kl": 0.0014286041259765625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.1135,
"reward": 0.918444074690342,
"reward_std": 0.6770734786987305,
"rewards/cosine_scaled_reward": 0.14672203361988068,
"rewards/format_reward": 0.6250000149011612,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1535.0416793823242,
"epoch": 0.13028571428571428,
"grad_norm": 0.08130006492137909,
"kl": 0.001720428466796875,
"learning_rate": 6.950195628537299e-07,
"loss": 0.1626,
"reward": 1.1805724427103996,
"reward_std": 0.5400062706321478,
"rewards/cosine_scaled_reward": 0.23611955903470516,
"rewards/format_reward": 0.7083333432674408,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 2394.0833740234375,
"epoch": 0.13085714285714287,
"grad_norm": 0.026739483699202538,
"kl": 0.0032806396484375,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0862,
"reward": 0.2099156752228737,
"reward_std": 0.8997384756803513,
"rewards/cosine_scaled_reward": -0.16587551310658455,
"rewards/format_reward": 0.5416666716337204,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 3482.416748046875,
"epoch": 0.13142857142857142,
"grad_norm": 0.03371553122997284,
"kl": 0.0015506744384765625,
"learning_rate": 6.890576474687263e-07,
"loss": 0.042,
"reward": 0.1788851134479046,
"reward_std": 1.0173771809786558,
"rewards/cosine_scaled_reward": -0.035557448863983154,
"rewards/format_reward": 0.2500000037252903,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 2954.9166870117188,
"epoch": 0.132,
"grad_norm": 0.04845058172941208,
"kl": 0.0019893646240234375,
"learning_rate": 6.860664508377001e-07,
"loss": -0.0419,
"reward": 0.22116372920572758,
"reward_std": 0.6237488426268101,
"rewards/cosine_scaled_reward": -0.035251480527222157,
"rewards/format_reward": 0.2916666679084301,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2848.3333435058594,
"epoch": 0.13257142857142856,
"grad_norm": 0.034000083804130554,
"kl": 0.0017070770263671875,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0386,
"reward": -0.31070365011692047,
"reward_std": 0.42055003717541695,
"rewards/cosine_scaled_reward": -0.28035183250904083,
"rewards/format_reward": 0.25,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 3568.7916870117188,
"epoch": 0.13314285714285715,
"grad_norm": 0.020154103636741638,
"kl": 0.0011072158813476562,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0088,
"reward": -0.326989121735096,
"reward_std": 0.28436263278126717,
"rewards/cosine_scaled_reward": -0.18432788737118244,
"rewards/format_reward": 0.0416666679084301,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 3366.5833740234375,
"epoch": 0.1337142857142857,
"grad_norm": 0.02581801265478134,
"kl": 0.0016918182373046875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0887,
"reward": 0.00817985087633133,
"reward_std": 0.7927666939795017,
"rewards/cosine_scaled_reward": -0.07924340665340424,
"rewards/format_reward": 0.1666666679084301,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 2722.500030517578,
"epoch": 0.13428571428571429,
"grad_norm": 0.031329214572906494,
"kl": 0.0017242431640625,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0564,
"reward": 0.3927113115787506,
"reward_std": 0.3258180283010006,
"rewards/cosine_scaled_reward": 0.07135568559169769,
"rewards/format_reward": 0.2500000111758709,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 2103.791748046875,
"epoch": 0.13485714285714287,
"grad_norm": 0.10697239637374878,
"kl": 0.0028324127197265625,
"learning_rate": 6.710139192768694e-07,
"loss": -0.2253,
"reward": 0.6361608393490314,
"reward_std": 0.7396758496761322,
"rewards/cosine_scaled_reward": -0.11941959708929062,
"rewards/format_reward": 0.8750000149011612,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 2060.3333435058594,
"epoch": 0.13542857142857143,
"grad_norm": 0.14326247572898865,
"kl": 0.00321197509765625,
"learning_rate": 6.679851303883891e-07,
"loss": 0.5268,
"reward": 0.07850592583417892,
"reward_std": 0.3517175354063511,
"rewards/cosine_scaled_reward": -0.23158037662506104,
"rewards/format_reward": 0.5416666865348816,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 2588.041717529297,
"epoch": 0.136,
"grad_norm": 0.029134077951312065,
"kl": 0.001407623291015625,
"learning_rate": 6.649505910711058e-07,
"loss": 0.1147,
"reward": 0.03815904259681702,
"reward_std": 0.9421084597706795,
"rewards/cosine_scaled_reward": -0.18925382010638714,
"rewards/format_reward": 0.4166666679084301,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 3495.2916870117188,
"epoch": 0.13657142857142857,
"grad_norm": 0.032422687858343124,
"kl": 0.0017642974853515625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0385,
"reward": -0.0038099363446235657,
"reward_std": 0.6519506089389324,
"rewards/cosine_scaled_reward": -0.10607165098190308,
"rewards/format_reward": 0.2083333395421505,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 3191.2916870117188,
"epoch": 0.13714285714285715,
"grad_norm": 0.03834246098995209,
"kl": 0.0018939971923828125,
"learning_rate": 6.588648530198504e-07,
"loss": 0.1727,
"reward": -0.5556656457483768,
"reward_std": 0.33576977998018265,
"rewards/cosine_scaled_reward": -0.3611661493778229,
"rewards/format_reward": 0.1666666716337204,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2791.1666870117188,
"epoch": 0.1377142857142857,
"grad_norm": 0.035378143191337585,
"kl": 0.0014400482177734375,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0347,
"reward": 0.4084257259964943,
"reward_std": 0.48210613802075386,
"rewards/cosine_scaled_reward": 0.05837950482964516,
"rewards/format_reward": 0.2916666679084301,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 3273.541748046875,
"epoch": 0.1382857142857143,
"grad_norm": 0.031054208055138588,
"kl": 0.0017986297607421875,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0816,
"reward": 0.08138477802276611,
"reward_std": 0.507510906085372,
"rewards/cosine_scaled_reward": -0.0843076054006815,
"rewards/format_reward": 0.2500000074505806,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 3058.3333435058594,
"epoch": 0.13885714285714285,
"grad_norm": 0.051523368805646896,
"kl": 0.002166748046875,
"learning_rate": 6.496968239287603e-07,
"loss": -0.047,
"reward": -0.3038304392248392,
"reward_std": 0.30531714484095573,
"rewards/cosine_scaled_reward": -0.2560819098725915,
"rewards/format_reward": 0.2083333432674408,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 3096.8751220703125,
"epoch": 0.13942857142857143,
"grad_norm": 0.0450126975774765,
"kl": 0.0025043487548828125,
"learning_rate": 6.466308972251785e-07,
"loss": 0.1393,
"reward": -0.2884998172521591,
"reward_std": 0.7407731115818024,
"rewards/cosine_scaled_reward": -0.29008324444293976,
"rewards/format_reward": 0.2916666753590107,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 2948.3334350585938,
"epoch": 0.14,
"grad_norm": 0.037310581654310226,
"kl": 0.0023345947265625,
"learning_rate": 6.435602608679916e-07,
"loss": 0.1311,
"reward": -0.0698681827634573,
"reward_std": 0.7073140665888786,
"rewards/cosine_scaled_reward": -0.22243409883230925,
"rewards/format_reward": 0.3750000149011612,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2646.7916717529297,
"epoch": 0.14057142857142857,
"grad_norm": 0.03474028408527374,
"kl": 0.0044040679931640625,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0024,
"reward": 0.28572891652584076,
"reward_std": 0.5858916472643614,
"rewards/cosine_scaled_reward": -0.08630221337080002,
"rewards/format_reward": 0.4583333432674408,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 3523.5833740234375,
"epoch": 0.14114285714285715,
"grad_norm": 0.025148576125502586,
"kl": 0.0015659332275390625,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0173,
"reward": 0.3484674394130707,
"reward_std": 0.9789317026734352,
"rewards/cosine_scaled_reward": 0.04923371225595474,
"rewards/format_reward": 0.25,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1775.2083435058594,
"epoch": 0.1417142857142857,
"grad_norm": 0.1530074179172516,
"kl": 0.0032968521118164062,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0899,
"reward": 0.8861258625984192,
"reward_std": 0.7357020750641823,
"rewards/cosine_scaled_reward": 0.0680629163980484,
"rewards/format_reward": 0.75,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 3250.7916870117188,
"epoch": 0.1422857142857143,
"grad_norm": 0.02402082085609436,
"kl": 0.0017642974853515625,
"learning_rate": 6.31233615362752e-07,
"loss": -0.0621,
"reward": -0.30002081394195557,
"reward_std": 0.27393733337521553,
"rewards/cosine_scaled_reward": -0.275010421872139,
"rewards/format_reward": 0.25,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2827.250045776367,
"epoch": 0.14285714285714285,
"grad_norm": 0.02889254502952099,
"kl": 0.001651763916015625,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0829,
"reward": 0.19747568666934967,
"reward_std": 0.5161016583442688,
"rewards/cosine_scaled_reward": -0.047095492482185364,
"rewards/format_reward": 0.2916666679084301,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 2044.4583740234375,
"epoch": 0.14342857142857143,
"grad_norm": 0.03344491869211197,
"kl": 0.0017852783203125,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0994,
"reward": 0.5866867303848267,
"reward_std": 0.42943302541971207,
"rewards/cosine_scaled_reward": -0.06082333158701658,
"rewards/format_reward": 0.7083333432674408,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 3359.625,
"epoch": 0.144,
"grad_norm": 0.03195515275001526,
"kl": 0.00228118896484375,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0971,
"reward": -0.43340764939785004,
"reward_std": 0.4943426577374339,
"rewards/cosine_scaled_reward": -0.27920382656157017,
"rewards/format_reward": 0.1250000037252903,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2978.6666870117188,
"epoch": 0.14457142857142857,
"grad_norm": 0.04462401941418648,
"kl": 0.0021305084228515625,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0225,
"reward": -0.260135967284441,
"reward_std": 0.3536509699188173,
"rewards/cosine_scaled_reward": -0.25506797432899475,
"rewards/format_reward": 0.25,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 2833.041717529297,
"epoch": 0.14514285714285713,
"grad_norm": 0.024158809334039688,
"kl": 0.0010128021240234375,
"learning_rate": 6.157373628530852e-07,
"loss": 0.1095,
"reward": 0.21884476393461227,
"reward_std": 0.5512615516781807,
"rewards/cosine_scaled_reward": -0.07807762175798416,
"rewards/format_reward": 0.3750000037252903,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2854.5416870117188,
"epoch": 0.1457142857142857,
"grad_norm": 0.02831795997917652,
"kl": 0.001422882080078125,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0172,
"reward": 0.48354003578424454,
"reward_std": 0.7642313642427325,
"rewards/cosine_scaled_reward": 0.05427001416683197,
"rewards/format_reward": 0.375,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 3583.375,
"epoch": 0.1462857142857143,
"grad_norm": 0.021453462541103363,
"kl": 0.0015506744384765625,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0004,
"reward": -0.642357274889946,
"reward_std": 0.2051343321800232,
"rewards/cosine_scaled_reward": -0.3420119658112526,
"rewards/format_reward": 0.0416666679084301,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 2608.875030517578,
"epoch": 0.14685714285714285,
"grad_norm": 0.04477157071232796,
"kl": 0.00202178955078125,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0655,
"reward": -0.05212653428316116,
"reward_std": 0.5651835091412067,
"rewards/cosine_scaled_reward": -0.2760632745921612,
"rewards/format_reward": 0.5000000074505806,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2090.875,
"epoch": 0.14742857142857144,
"grad_norm": 0.036602683365345,
"kl": 0.0011730194091796875,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0121,
"reward": 0.16203255951404572,
"reward_std": 0.25791847333312035,
"rewards/cosine_scaled_reward": -0.16898373514413834,
"rewards/format_reward": 0.5,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.148,
"grad_norm": 0.03482256084680557,
"kl": 0.001644134521484375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0001,
"reward": -0.5393809229135513,
"reward_std": 0.27719491347670555,
"rewards/cosine_scaled_reward": -0.26969046890735626,
"rewards/format_reward": 0.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 3430.8333740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.024355540052056313,
"kl": 0.0018463134765625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0305,
"reward": 0.07202602922916412,
"reward_std": 0.5382623486220837,
"rewards/cosine_scaled_reward": -0.04732033051550388,
"rewards/format_reward": 0.1666666716337204,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 3494.5833740234375,
"epoch": 0.14914285714285713,
"grad_norm": 0.030503125861287117,
"kl": 0.0013980865478515625,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0401,
"reward": -0.5600034184753895,
"reward_std": 0.17206953093409538,
"rewards/cosine_scaled_reward": -0.30083504132926464,
"rewards/format_reward": 0.0416666679084301,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 3101.3334350585938,
"epoch": 0.14971428571428572,
"grad_norm": 0.031000809744000435,
"kl": 0.001499176025390625,
"learning_rate": 5.907846610890011e-07,
"loss": 0.1274,
"reward": 0.46077074110507965,
"reward_std": 0.9705281481146812,
"rewards/cosine_scaled_reward": 0.0012187082320451736,
"rewards/format_reward": 0.4583333469927311,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2895.6250610351562,
"epoch": 0.15028571428571427,
"grad_norm": 0.06233460456132889,
"kl": 0.0022935867309570312,
"learning_rate": 5.87655029499542e-07,
"loss": 0.1454,
"reward": 0.12187894992530346,
"reward_std": 0.4917758535593748,
"rewards/cosine_scaled_reward": -0.10572719946503639,
"rewards/format_reward": 0.3333333432674408,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 3521.5833740234375,
"epoch": 0.15085714285714286,
"grad_norm": 0.023180929943919182,
"kl": 0.001529693603515625,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0256,
"reward": -0.47894274443387985,
"reward_std": 0.19030883349478245,
"rewards/cosine_scaled_reward": -0.2603047080338001,
"rewards/format_reward": 0.0416666679084301,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 3414.5833740234375,
"epoch": 0.15142857142857144,
"grad_norm": 0.0296033825725317,
"kl": 0.002376556396484375,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0306,
"reward": -0.5536337494850159,
"reward_std": 0.22310560196638107,
"rewards/cosine_scaled_reward": -0.29765018820762634,
"rewards/format_reward": 0.0416666679084301,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 2804.5,
"epoch": 0.152,
"grad_norm": 0.05227090045809746,
"kl": 0.0020275115966796875,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0961,
"reward": -0.1058163233101368,
"reward_std": 0.2853359058499336,
"rewards/cosine_scaled_reward": -0.2820748332887888,
"rewards/format_reward": 0.4583333432674408,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1814.6250305175781,
"epoch": 0.15257142857142858,
"grad_norm": 0.04941815882921219,
"kl": 0.0023040771484375,
"learning_rate": 5.751196772469237e-07,
"loss": 0.1364,
"reward": 0.15700630843639374,
"reward_std": 0.5053564310073853,
"rewards/cosine_scaled_reward": -0.2548302114009857,
"rewards/format_reward": 0.6666666716337204,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2904.291748046875,
"epoch": 0.15314285714285714,
"grad_norm": 0.0729227364063263,
"kl": 0.0023136138916015625,
"learning_rate": 5.71982396408026e-07,
"loss": 0.1989,
"reward": 0.28148211538791656,
"reward_std": 0.7497629150748253,
"rewards/cosine_scaled_reward": -0.06759228184819221,
"rewards/format_reward": 0.4166666828095913,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 3234.8750610351562,
"epoch": 0.15371428571428572,
"grad_norm": 0.036167096346616745,
"kl": 0.0014944076538085938,
"learning_rate": 5.688440441781398e-07,
"loss": 0.1373,
"reward": -0.1977532785385847,
"reward_std": 0.5611210819333792,
"rewards/cosine_scaled_reward": -0.1822099736891687,
"rewards/format_reward": 0.1666666679084301,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 2171.8333435058594,
"epoch": 0.15428571428571428,
"grad_norm": 0.051499780267477036,
"kl": 0.0028324127197265625,
"learning_rate": 5.657047735161255e-07,
"loss": 0.1384,
"reward": 0.7494456171989441,
"reward_std": 0.3765489496290684,
"rewards/cosine_scaled_reward": 0.12472283840179443,
"rewards/format_reward": 0.5,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 2817.0416870117188,
"epoch": 0.15485714285714286,
"grad_norm": 0.05161137878894806,
"kl": 0.002685546875,
"learning_rate": 5.625647374256061e-07,
"loss": 0.185,
"reward": 0.15184487029910088,
"reward_std": 0.7885057218372822,
"rewards/cosine_scaled_reward": -0.15324425138533115,
"rewards/format_reward": 0.4583333469927311,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2612.291748046875,
"epoch": 0.15542857142857142,
"grad_norm": 0.03608538582921028,
"kl": 0.0018749237060546875,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0447,
"reward": -0.1857751179486513,
"reward_std": 0.6941423173993826,
"rewards/cosine_scaled_reward": -0.2803875617682934,
"rewards/format_reward": 0.3750000037252903,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 3204.666748046875,
"epoch": 0.156,
"grad_norm": 0.04544828087091446,
"kl": 0.00322723388671875,
"learning_rate": 5.562829811526154e-07,
"loss": 0.1119,
"reward": -0.2601732239127159,
"reward_std": 0.7017681710422039,
"rewards/cosine_scaled_reward": -0.2550866212695837,
"rewards/format_reward": 0.25,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.15657142857142858,
"grad_norm": 0.02732660062611103,
"kl": 0.00152587890625,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0001,
"reward": -0.5715373530983925,
"reward_std": 0.16449665278196335,
"rewards/cosine_scaled_reward": -0.28576867654919624,
"rewards/format_reward": 0.0,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 2266.2916870117188,
"epoch": 0.15714285714285714,
"grad_norm": 0.04678889736533165,
"kl": 0.002040863037109375,
"learning_rate": 5.5e-07,
"loss": 0.1111,
"reward": -0.02398175746202469,
"reward_std": 0.5438921824097633,
"rewards/cosine_scaled_reward": -0.26199088245630264,
"rewards/format_reward": 0.5000000111758709,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 2080.2916717529297,
"epoch": 0.15771428571428572,
"grad_norm": 0.04423975571990013,
"kl": 0.001983642578125,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0326,
"reward": 0.8940422385931015,
"reward_std": 0.6977650858461857,
"rewards/cosine_scaled_reward": 0.1553544420748949,
"rewards/format_reward": 0.5833333358168602,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 3313.4166870117188,
"epoch": 0.15828571428571428,
"grad_norm": 0.033737484365701675,
"kl": 0.001659393310546875,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0955,
"reward": 0.24843797460198402,
"reward_std": 0.8951086550951004,
"rewards/cosine_scaled_reward": -0.042447689920663834,
"rewards/format_reward": 0.3333333395421505,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2685.4583740234375,
"epoch": 0.15885714285714286,
"grad_norm": 0.036528535187244415,
"kl": 0.001453399658203125,
"learning_rate": 5.405759110524894e-07,
"loss": -0.1292,
"reward": 0.11107654124498367,
"reward_std": 0.6049798280000687,
"rewards/cosine_scaled_reward": -0.17362840473651886,
"rewards/format_reward": 0.4583333432674408,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 2706.000045776367,
"epoch": 0.15942857142857142,
"grad_norm": 0.03959155082702637,
"kl": 0.002044677734375,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0108,
"reward": 0.2249528057873249,
"reward_std": 0.6208834704011679,
"rewards/cosine_scaled_reward": -0.1375236064195633,
"rewards/format_reward": 0.5000000074505806,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 2622.7083740234375,
"epoch": 0.16,
"grad_norm": 0.03432968258857727,
"kl": 0.0015316009521484375,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0388,
"reward": 0.3257916159927845,
"reward_std": 0.5739619396626949,
"rewards/cosine_scaled_reward": -0.0871041975915432,
"rewards/format_reward": 0.5000000111758709,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 2522.4583435058594,
"epoch": 0.16057142857142856,
"grad_norm": 0.1340794712305069,
"kl": 0.0046539306640625,
"learning_rate": 5.311559558218603e-07,
"loss": 0.3254,
"reward": 0.24769596755504608,
"reward_std": 0.6493666023015976,
"rewards/cosine_scaled_reward": -0.06365201622247696,
"rewards/format_reward": 0.3750000149011612,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 3512.25,
"epoch": 0.16114285714285714,
"grad_norm": 0.036346111446619034,
"kl": 0.002231597900390625,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0214,
"reward": -0.08259931951761246,
"reward_std": 0.5185085125267506,
"rewards/cosine_scaled_reward": -0.10379965975880623,
"rewards/format_reward": 0.125,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2381.2083435058594,
"epoch": 0.16171428571428573,
"grad_norm": 0.04855608195066452,
"kl": 0.002384185791015625,
"learning_rate": 5.248803227530763e-07,
"loss": 0.2503,
"reward": -0.16534588485956192,
"reward_std": 0.4023447409272194,
"rewards/cosine_scaled_reward": -0.29100628197193146,
"rewards/format_reward": 0.4166666716337204,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2702.9166717529297,
"epoch": 0.16228571428571428,
"grad_norm": 0.06319528818130493,
"kl": 0.0033445358276367188,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0455,
"reward": 0.9203528836369514,
"reward_std": 0.6037855893373489,
"rewards/cosine_scaled_reward": 0.23100974038243294,
"rewards/format_reward": 0.4583333432674408,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 3470.625,
"epoch": 0.16285714285714287,
"grad_norm": 0.024533158168196678,
"kl": 0.002277374267578125,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0548,
"reward": -0.4412531442940235,
"reward_std": 0.2475447803735733,
"rewards/cosine_scaled_reward": -0.24145990796387196,
"rewards/format_reward": 0.0416666679084301,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 3249.5000610351562,
"epoch": 0.16342857142857142,
"grad_norm": 0.023537656292319298,
"kl": 0.001232147216796875,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0495,
"reward": 0.6038705855607986,
"reward_std": 1.2430939674377441,
"rewards/cosine_scaled_reward": 0.11443530488759279,
"rewards/format_reward": 0.375,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 2912.2084045410156,
"epoch": 0.164,
"grad_norm": 0.02962152287364006,
"kl": 0.002147674560546875,
"learning_rate": 5.123449705004581e-07,
"loss": -0.0385,
"reward": 0.1606165710836649,
"reward_std": 1.0313879009336233,
"rewards/cosine_scaled_reward": -0.12802506377920508,
"rewards/format_reward": 0.4166666679084301,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 3232.0833740234375,
"epoch": 0.16457142857142856,
"grad_norm": 0.052544910460710526,
"kl": 0.002452850341796875,
"learning_rate": 5.09215338910999e-07,
"loss": 0.1251,
"reward": -0.2673683166503906,
"reward_std": 0.5355051569640636,
"rewards/cosine_scaled_reward": -0.2795174904167652,
"rewards/format_reward": 0.291666679084301,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 2954.625030517578,
"epoch": 0.16514285714285715,
"grad_norm": 0.11674802005290985,
"kl": 0.0030517578125,
"learning_rate": 5.060876951083828e-07,
"loss": 0.2068,
"reward": -0.09129756689071655,
"reward_std": 0.5470742993056774,
"rewards/cosine_scaled_reward": -0.19148211926221848,
"rewards/format_reward": 0.291666679084301,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 2834.2083740234375,
"epoch": 0.1657142857142857,
"grad_norm": 0.04204864799976349,
"kl": 0.003559112548828125,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0704,
"reward": 0.04473920911550522,
"reward_std": 0.7532506696879864,
"rewards/cosine_scaled_reward": -0.20679706521332264,
"rewards/format_reward": 0.4583333432674408,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1662857142857143,
"grad_norm": 0.026785722002387047,
"kl": 0.00194549560546875,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0001,
"reward": -0.4050272926688194,
"reward_std": 0.22680751886218786,
"rewards/cosine_scaled_reward": -0.26501364447176456,
"rewards/format_reward": 0.125,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 2743.9583740234375,
"epoch": 0.16685714285714287,
"grad_norm": 0.04605742171406746,
"kl": 0.002735137939453125,
"learning_rate": 4.967182142620745e-07,
"loss": 0.1363,
"reward": -0.17294890712946653,
"reward_std": 0.7744690459221601,
"rewards/cosine_scaled_reward": -0.2948077991604805,
"rewards/format_reward": 0.416666679084301,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 2113.5833435058594,
"epoch": 0.16742857142857143,
"grad_norm": 0.06665001809597015,
"kl": 0.002529144287109375,
"learning_rate": 4.93600044896063e-07,
"loss": -0.1619,
"reward": 1.1441311240196228,
"reward_std": 0.7703273370862007,
"rewards/cosine_scaled_reward": 0.197065532207489,
"rewards/format_reward": 0.75,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 3352.125,
"epoch": 0.168,
"grad_norm": 0.03626595437526703,
"kl": 0.0022869110107421875,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0957,
"reward": -0.5403950586915016,
"reward_std": 0.4801517631858587,
"rewards/cosine_scaled_reward": -0.3118642121553421,
"rewards/format_reward": 0.0833333358168602,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 3068.2918090820312,
"epoch": 0.16857142857142857,
"grad_norm": 0.06331875920295715,
"kl": 0.0023403167724609375,
"learning_rate": 4.873721045679706e-07,
"loss": 0.1906,
"reward": -0.1992212040349841,
"reward_std": 0.335015382617712,
"rewards/cosine_scaled_reward": -0.2454439401626587,
"rewards/format_reward": 0.2916666716337204,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 3276.875,
"epoch": 0.16914285714285715,
"grad_norm": 0.03193613886833191,
"kl": 0.0019054412841796875,
"learning_rate": 4.842626371469149e-07,
"loss": 0.1161,
"reward": -0.44363995268940926,
"reward_std": 0.5190198123455048,
"rewards/cosine_scaled_reward": -0.30515330098569393,
"rewards/format_reward": 0.1666666679084301,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 3095.416748046875,
"epoch": 0.1697142857142857,
"grad_norm": 0.03605992719531059,
"kl": 0.0017986297607421875,
"learning_rate": 4.811563736721829e-07,
"loss": 0.1403,
"reward": 0.3412788724526763,
"reward_std": 1.0304713770747185,
"rewards/cosine_scaled_reward": -0.016860555857419968,
"rewards/format_reward": 0.3750000074505806,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 2780.9584197998047,
"epoch": 0.1702857142857143,
"grad_norm": 0.03472289815545082,
"kl": 0.0021076202392578125,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0383,
"reward": 0.31697067245841026,
"reward_std": 0.8645676001906395,
"rewards/cosine_scaled_reward": -0.02901467215269804,
"rewards/format_reward": 0.3750000037252903,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 3380.0416870117188,
"epoch": 0.17085714285714285,
"grad_norm": 0.036116454750299454,
"kl": 0.00353240966796875,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0643,
"reward": -0.28232631646096706,
"reward_std": 0.3243029280565679,
"rewards/cosine_scaled_reward": -0.22449648473411798,
"rewards/format_reward": 0.1666666679084301,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 3032.8333435058594,
"epoch": 0.17142857142857143,
"grad_norm": 0.05061323195695877,
"kl": 0.002704620361328125,
"learning_rate": 4.7185832004988133e-07,
"loss": -0.1324,
"reward": -0.30845024436712265,
"reward_std": 0.28765837103128433,
"rewards/cosine_scaled_reward": -0.3000584542751312,
"rewards/format_reward": 0.2916666679084301,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 2668.166717529297,
"epoch": 0.172,
"grad_norm": 0.05065063014626503,
"kl": 0.0015020370483398438,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0702,
"reward": -0.1258373148739338,
"reward_std": 0.28961411863565445,
"rewards/cosine_scaled_reward": -0.25041866675019264,
"rewards/format_reward": 0.375,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 2789.125045776367,
"epoch": 0.17257142857142857,
"grad_norm": 0.038695093244314194,
"kl": 0.0020580291748046875,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0482,
"reward": 0.2734690923243761,
"reward_std": 0.3800287460908294,
"rewards/cosine_scaled_reward": -0.009098782204091549,
"rewards/format_reward": 0.2916666679084301,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 2981.4583740234375,
"epoch": 0.17314285714285715,
"grad_norm": 0.027125045657157898,
"kl": 0.00201416015625,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0003,
"reward": 0.03397008776664734,
"reward_std": 0.25730124674737453,
"rewards/cosine_scaled_reward": -0.12884829938411713,
"rewards/format_reward": 0.2916666679084301,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 3580.5833740234375,
"epoch": 0.1737142857142857,
"grad_norm": 0.023192252963781357,
"kl": 0.0017671585083007812,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0008,
"reward": -0.3860953450202942,
"reward_std": 0.25841284543275833,
"rewards/cosine_scaled_reward": -0.2347143404185772,
"rewards/format_reward": 0.0833333358168602,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 3224.2500610351562,
"epoch": 0.1742857142857143,
"grad_norm": 0.02765670232474804,
"kl": 0.0017538070678710938,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0862,
"reward": 0.176173347979784,
"reward_std": 0.9991654083132744,
"rewards/cosine_scaled_reward": -0.07857999578118324,
"rewards/format_reward": 0.3333333432674408,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 2369.5416870117188,
"epoch": 0.17485714285714285,
"grad_norm": 0.04539618641138077,
"kl": 0.001949310302734375,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.1212,
"reward": 0.29690991109237075,
"reward_std": 0.47233542799949646,
"rewards/cosine_scaled_reward": -0.080711729824543,
"rewards/format_reward": 0.4583333432674408,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17542857142857143,
"grad_norm": 0.02914069965481758,
"kl": 0.0013408660888671875,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0001,
"reward": -0.7341588288545609,
"reward_std": 0.3520149402320385,
"rewards/cosine_scaled_reward": -0.3670794293284416,
"rewards/format_reward": 0.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2396.875030517578,
"epoch": 0.176,
"grad_norm": 0.07532023638486862,
"kl": 0.003215789794921875,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.144,
"reward": 0.7676626294851303,
"reward_std": 0.7929953560233116,
"rewards/cosine_scaled_reward": 0.13383129611611366,
"rewards/format_reward": 0.5000000111758709,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17657142857142857,
"grad_norm": 0.02417682111263275,
"kl": 0.0022430419921875,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0001,
"reward": -0.3408763660117984,
"reward_std": 0.412366415373981,
"rewards/cosine_scaled_reward": -0.19127151230350137,
"rewards/format_reward": 0.0416666679084301,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 3580.75,
"epoch": 0.17714285714285713,
"grad_norm": 0.029466478154063225,
"kl": 0.0016956329345703125,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0018,
"reward": -0.5854391157627106,
"reward_std": 0.18960697948932648,
"rewards/cosine_scaled_reward": -0.3135529085993767,
"rewards/format_reward": 0.0416666679084301,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 2888.5416717529297,
"epoch": 0.1777142857142857,
"grad_norm": 0.02941196970641613,
"kl": 0.00191497802734375,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0115,
"reward": -0.007554952055215836,
"reward_std": 0.3427964933216572,
"rewards/cosine_scaled_reward": -0.1287774909287691,
"rewards/format_reward": 0.25,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 3370.7083740234375,
"epoch": 0.1782857142857143,
"grad_norm": 0.042647719383239746,
"kl": 0.00409698486328125,
"learning_rate": 4.350494089288943e-07,
"loss": 0.115,
"reward": -0.3963359580375254,
"reward_std": 0.22513410821557045,
"rewards/cosine_scaled_reward": -0.23983464762568474,
"rewards/format_reward": 0.0833333358168602,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2669.6666717529297,
"epoch": 0.17885714285714285,
"grad_norm": 0.044029586017131805,
"kl": 0.0015001296997070312,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0538,
"reward": 0.3241320550441742,
"reward_std": 0.871776606887579,
"rewards/cosine_scaled_reward": -0.04626730363816023,
"rewards/format_reward": 0.4166666679084301,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 2831.9583740234375,
"epoch": 0.17942857142857144,
"grad_norm": 0.031612906605005264,
"kl": 0.001766204833984375,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.2969,
"reward": 0.529218353331089,
"reward_std": 1.1238218173384666,
"rewards/cosine_scaled_reward": 0.0771091878414154,
"rewards/format_reward": 0.3750000149011612,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 3057.4583435058594,
"epoch": 0.18,
"grad_norm": 0.04660829156637192,
"kl": 0.002025604248046875,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.1379,
"reward": -0.42779871821403503,
"reward_std": 0.3359145261347294,
"rewards/cosine_scaled_reward": -0.3388993591070175,
"rewards/format_reward": 0.2500000111758709,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.18057142857142858,
"grad_norm": 0.024809332564473152,
"kl": 0.002231597900390625,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0001,
"reward": -0.5435370802879333,
"reward_std": 0.1862776866182685,
"rewards/cosine_scaled_reward": -0.27176854759454727,
"rewards/format_reward": 0.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 2426.6250915527344,
"epoch": 0.18114285714285713,
"grad_norm": 0.04658813774585724,
"kl": 0.00171661376953125,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.1369,
"reward": 0.3155972547829151,
"reward_std": 0.8845781460404396,
"rewards/cosine_scaled_reward": -0.11303469305858016,
"rewards/format_reward": 0.5416666753590107,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 2817.9584350585938,
"epoch": 0.18171428571428572,
"grad_norm": 0.07211805135011673,
"kl": 0.00244140625,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.1614,
"reward": 0.275835745036602,
"reward_std": 0.9854978695511818,
"rewards/cosine_scaled_reward": -0.09124879166483879,
"rewards/format_reward": 0.4583333507180214,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 3308.8334350585938,
"epoch": 0.18228571428571427,
"grad_norm": 0.0264178067445755,
"kl": 0.0027303695678710938,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.1043,
"reward": 0.05568153224885464,
"reward_std": 0.8488932326436043,
"rewards/cosine_scaled_reward": -0.11799256131052971,
"rewards/format_reward": 0.2916666716337204,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 2949.5,
"epoch": 0.18285714285714286,
"grad_norm": 0.034236758947372437,
"kl": 0.0047817230224609375,
"learning_rate": 4.1094235253127374e-07,
"loss": -0.0153,
"reward": -0.12276215478777885,
"reward_std": 0.3791249468922615,
"rewards/cosine_scaled_reward": -0.18638107739388943,
"rewards/format_reward": 0.25,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 3338.4583740234375,
"epoch": 0.18342857142857144,
"grad_norm": 0.025821806862950325,
"kl": 0.00222015380859375,
"learning_rate": 4.079579333738039e-07,
"loss": 0.063,
"reward": -0.18705711886286736,
"reward_std": 0.4808242991566658,
"rewards/cosine_scaled_reward": -0.1560285510495305,
"rewards/format_reward": 0.125,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 3565.0833740234375,
"epoch": 0.184,
"grad_norm": 0.022971007972955704,
"kl": 0.002262115478515625,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0059,
"reward": -0.02773890271782875,
"reward_std": 0.7211907394230366,
"rewards/cosine_scaled_reward": -0.09720278717577457,
"rewards/format_reward": 0.1666666679084301,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 3341.416748046875,
"epoch": 0.18457142857142858,
"grad_norm": 0.022766903042793274,
"kl": 0.001781463623046875,
"learning_rate": 4.020100089676376e-07,
"loss": 0.1036,
"reward": -0.10803468152880669,
"reward_std": 0.6073896177113056,
"rewards/cosine_scaled_reward": -0.15818401239812374,
"rewards/format_reward": 0.2083333358168602,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 3376.3333740234375,
"epoch": 0.18514285714285714,
"grad_norm": 0.033231932669878006,
"kl": 0.0024585723876953125,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0075,
"reward": -0.34984915517270565,
"reward_std": 0.5265899635851383,
"rewards/cosine_scaled_reward": -0.2582579143345356,
"rewards/format_reward": 0.1666666679084301,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2811.208335876465,
"epoch": 0.18571428571428572,
"grad_norm": 0.09550298750400543,
"kl": 0.0038661956787109375,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0804,
"reward": -0.2852107435464859,
"reward_std": 0.19083382561802864,
"rewards/cosine_scaled_reward": -0.26760537922382355,
"rewards/format_reward": 0.25,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2559.2916870117188,
"epoch": 0.18628571428571428,
"grad_norm": 0.07037398219108582,
"kl": 0.00316619873046875,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0938,
"reward": 0.3256427589803934,
"reward_std": 0.8185454159975052,
"rewards/cosine_scaled_reward": -0.06634528655558825,
"rewards/format_reward": 0.4583333358168602,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 3146.6666870117188,
"epoch": 0.18685714285714286,
"grad_norm": 0.048057761043310165,
"kl": 0.00212860107421875,
"learning_rate": 3.902018669163384e-07,
"loss": 0.1516,
"reward": -0.40305728605017066,
"reward_std": 0.17889815475791693,
"rewards/cosine_scaled_reward": -0.2848619734868407,
"rewards/format_reward": 0.1666666716337204,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2758.9166870117188,
"epoch": 0.18742857142857142,
"grad_norm": 0.041575562208890915,
"kl": 0.0021953582763671875,
"learning_rate": 3.872689434630585e-07,
"loss": 0.1423,
"reward": -0.24416162073612213,
"reward_std": 0.4843718595802784,
"rewards/cosine_scaled_reward": -0.26791415363550186,
"rewards/format_reward": 0.2916666679084301,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 3113.5416870117188,
"epoch": 0.188,
"grad_norm": 0.02622011862695217,
"kl": 0.0013408660888671875,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0785,
"reward": 0.2258373498916626,
"reward_std": 0.9825378842651844,
"rewards/cosine_scaled_reward": -0.05374800169374794,
"rewards/format_reward": 0.3333333469927311,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 2908.750030517578,
"epoch": 0.18857142857142858,
"grad_norm": 0.04580061137676239,
"kl": 0.0025386810302734375,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.2083,
"reward": -0.30929698050022125,
"reward_std": 0.540601166896522,
"rewards/cosine_scaled_reward": -0.2796484977006912,
"rewards/format_reward": 0.2500000111758709,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 2569.791732788086,
"epoch": 0.18914285714285714,
"grad_norm": 0.04554865509271622,
"kl": 0.0025882720947265625,
"learning_rate": 3.785183306423767e-07,
"loss": 0.101,
"reward": 0.4928191304206848,
"reward_std": 0.23201470030471683,
"rewards/cosine_scaled_reward": 0.07974286749958992,
"rewards/format_reward": 0.3333333358168602,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 2199.7916870117188,
"epoch": 0.18971428571428572,
"grad_norm": 0.0294233076274395,
"kl": 0.00125885009765625,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0188,
"reward": 0.19253763556480408,
"reward_std": 0.5630042403936386,
"rewards/cosine_scaled_reward": -0.15373118966817856,
"rewards/format_reward": 0.5,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 3060.4583740234375,
"epoch": 0.19028571428571428,
"grad_norm": 0.06838904321193695,
"kl": 0.0027332305908203125,
"learning_rate": 3.72726140684072e-07,
"loss": -0.088,
"reward": 0.011348485946655273,
"reward_std": 0.6977972388267517,
"rewards/cosine_scaled_reward": -0.16099243192002177,
"rewards/format_reward": 0.3333333358168602,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 3204.1666870117188,
"epoch": 0.19085714285714286,
"grad_norm": 0.037621643394231796,
"kl": 0.0014801025390625,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0844,
"reward": -0.26741091907024384,
"reward_std": 0.4083873387426138,
"rewards/cosine_scaled_reward": -0.21703878417611122,
"rewards/format_reward": 0.1666666716337204,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 2231.0833740234375,
"epoch": 0.19142857142857142,
"grad_norm": 0.045068252831697464,
"kl": 0.0029354095458984375,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.2129,
"reward": 0.10812489595264196,
"reward_std": 0.42042385786771774,
"rewards/cosine_scaled_reward": -0.27927088737487793,
"rewards/format_reward": 0.666666679084301,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 2707.541748046875,
"epoch": 0.192,
"grad_norm": 0.03010060451924801,
"kl": 0.003017425537109375,
"learning_rate": 3.641030065789562e-07,
"loss": 0.1034,
"reward": 0.4937493186444044,
"reward_std": 0.9506418071687222,
"rewards/cosine_scaled_reward": -0.02395869791507721,
"rewards/format_reward": 0.5416666716337204,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2758.4583587646484,
"epoch": 0.19257142857142856,
"grad_norm": 0.043119970709085464,
"kl": 0.00180816650390625,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0337,
"reward": 0.5623410455882549,
"reward_std": 0.900443509221077,
"rewards/cosine_scaled_reward": 0.07283718138933182,
"rewards/format_reward": 0.4166666679084301,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2846.9583740234375,
"epoch": 0.19314285714285714,
"grad_norm": 0.04690620303153992,
"kl": 0.0030651092529296875,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0839,
"reward": 0.1852373331785202,
"reward_std": 1.1534279137849808,
"rewards/cosine_scaled_reward": -0.11571469902992249,
"rewards/format_reward": 0.4166666679084301,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.19371428571428573,
"grad_norm": 0.0250700693577528,
"kl": 0.002323150634765625,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0001,
"reward": -0.5875216303393245,
"reward_std": 0.16191167384386063,
"rewards/cosine_scaled_reward": -0.29376082262024283,
"rewards/format_reward": 0.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2209.250030517578,
"epoch": 0.19428571428571428,
"grad_norm": 0.04741514474153519,
"kl": 0.00539398193359375,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0141,
"reward": 0.3237254023551941,
"reward_std": 0.3493252918124199,
"rewards/cosine_scaled_reward": -0.08813730999827385,
"rewards/format_reward": 0.5,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1355.5417022705078,
"epoch": 0.19485714285714287,
"grad_norm": 0.03962629660964012,
"kl": 0.0014257431030273438,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.1484,
"reward": 0.2004435583949089,
"reward_std": 0.4287584722042084,
"rewards/cosine_scaled_reward": -0.31644489243626595,
"rewards/format_reward": 0.8333333432674408,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 2390.9583435058594,
"epoch": 0.19542857142857142,
"grad_norm": 0.07413430511951447,
"kl": 0.0021076202392578125,
"learning_rate": 3.471051066897562e-07,
"loss": 0.1456,
"reward": 0.04607273545116186,
"reward_std": 0.5195755325257778,
"rewards/cosine_scaled_reward": -0.18529696762561798,
"rewards/format_reward": 0.4166666716337204,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 3195.8333740234375,
"epoch": 0.196,
"grad_norm": 0.04542411491274834,
"kl": 0.0027923583984375,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.1701,
"reward": -0.25569210201501846,
"reward_std": 0.7485100533813238,
"rewards/cosine_scaled_reward": -0.23201273381710052,
"rewards/format_reward": 0.2083333395421505,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 2487.000030517578,
"epoch": 0.19657142857142856,
"grad_norm": 0.09508878737688065,
"kl": 0.002880096435546875,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.1892,
"reward": 0.1885826736688614,
"reward_std": 0.491073552519083,
"rewards/cosine_scaled_reward": -0.0932086780667305,
"rewards/format_reward": 0.375,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 1677.1666870117188,
"epoch": 0.19714285714285715,
"grad_norm": 0.04884251952171326,
"kl": 0.0024852752685546875,
"learning_rate": 3.387377967463493e-07,
"loss": -0.0392,
"reward": 0.655265687033534,
"reward_std": 0.5575139056891203,
"rewards/cosine_scaled_reward": -0.005700506269931793,
"rewards/format_reward": 0.6666666716337204,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1977142857142857,
"grad_norm": 0.02431817352771759,
"kl": 0.0021581649780273438,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0001,
"reward": -0.4340847618877888,
"reward_std": 0.24131110310554504,
"rewards/cosine_scaled_reward": -0.2587090525776148,
"rewards/format_reward": 0.0833333358168602,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 3101.2083435058594,
"epoch": 0.1982857142857143,
"grad_norm": 0.029230665415525436,
"kl": 0.001796722412109375,
"learning_rate": 3.3321084665422803e-07,
"loss": -0.0451,
"reward": -0.037752747535705566,
"reward_std": 0.5485346168279648,
"rewards/cosine_scaled_reward": -0.14387639239430428,
"rewards/format_reward": 0.25,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 3108.8333740234375,
"epoch": 0.19885714285714284,
"grad_norm": 0.03228713572025299,
"kl": 0.0021820068359375,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0192,
"reward": -0.13820890709757805,
"reward_std": 0.6344441249966621,
"rewards/cosine_scaled_reward": -0.21493778750300407,
"rewards/format_reward": 0.291666679084301,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 3467.7083740234375,
"epoch": 0.19942857142857143,
"grad_norm": 0.03689376264810562,
"kl": 0.00193023681640625,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0277,
"reward": -0.0664571076631546,
"reward_std": 0.611913550645113,
"rewards/cosine_scaled_reward": -0.1373952403664589,
"rewards/format_reward": 0.2083333395421505,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2246.0833435058594,
"epoch": 0.2,
"grad_norm": 0.028090570122003555,
"kl": 0.0030994415283203125,
"learning_rate": 3.250000000000001e-07,
"loss": 0.066,
"reward": 0.7716233059763908,
"reward_std": 0.3056010231375694,
"rewards/cosine_scaled_reward": 0.13581165112555027,
"rewards/format_reward": 0.5,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 2734.041748046875,
"epoch": 0.20057142857142857,
"grad_norm": 0.07741723209619522,
"kl": 0.013034820556640625,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0527,
"reward": 0.2801409657113254,
"reward_std": 1.3329923450946808,
"rewards/cosine_scaled_reward": -0.10992954391986132,
"rewards/format_reward": 0.5000000037252903,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2575.8333587646484,
"epoch": 0.20114285714285715,
"grad_norm": 0.046926870942115784,
"kl": 0.00290679931640625,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0756,
"reward": 0.39545369893312454,
"reward_std": 0.8376414366066456,
"rewards/cosine_scaled_reward": -0.03143983148038387,
"rewards/format_reward": 0.4583333395421505,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 2603.916748046875,
"epoch": 0.2017142857142857,
"grad_norm": 0.03367123752832413,
"kl": 0.00176239013671875,
"learning_rate": 3.168878457820915e-07,
"loss": 0.142,
"reward": 0.08237806335091591,
"reward_std": 0.5855906754732132,
"rewards/cosine_scaled_reward": -0.2296443209052086,
"rewards/format_reward": 0.541666679084301,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 2574.5416870117188,
"epoch": 0.2022857142857143,
"grad_norm": 0.024790968745946884,
"kl": 0.001708984375,
"learning_rate": 3.142063423134644e-07,
"loss": -0.0479,
"reward": 0.29116785526275635,
"reward_std": 0.5448461137712002,
"rewards/cosine_scaled_reward": -0.10441606491804123,
"rewards/format_reward": 0.5,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 2699.500030517578,
"epoch": 0.20285714285714285,
"grad_norm": 0.038337625563144684,
"kl": 0.0048809051513671875,
"learning_rate": 3.115363310950578e-07,
"loss": -0.0513,
"reward": 0.25913889706134796,
"reward_std": 0.282584385946393,
"rewards/cosine_scaled_reward": -0.09959721937775612,
"rewards/format_reward": 0.4583333432674408,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 2483.125,
"epoch": 0.20342857142857143,
"grad_norm": 0.08116474747657776,
"kl": 0.00202178955078125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.2336,
"reward": 0.8199636042118073,
"reward_std": 0.6027325298637152,
"rewards/cosine_scaled_reward": 0.18081511557102203,
"rewards/format_reward": 0.4583333432674408,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 2844.500030517578,
"epoch": 0.204,
"grad_norm": 0.025603273883461952,
"kl": 0.0018148422241210938,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0806,
"reward": -0.39298439770936966,
"reward_std": 0.20228938292711973,
"rewards/cosine_scaled_reward": -0.36315886676311493,
"rewards/format_reward": 0.3333333358168602,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 2851.416717529297,
"epoch": 0.20457142857142857,
"grad_norm": 0.028645722195506096,
"kl": 0.003147125244140625,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0701,
"reward": 0.35892677307128906,
"reward_std": 0.593839131295681,
"rewards/cosine_scaled_reward": -0.008036583662033081,
"rewards/format_reward": 0.375,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 2595.4583740234375,
"epoch": 0.20514285714285715,
"grad_norm": 0.05657227337360382,
"kl": 0.0027923583984375,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.1043,
"reward": 0.015408764127641916,
"reward_std": 0.68564473092556,
"rewards/cosine_scaled_reward": -0.26312895119190216,
"rewards/format_reward": 0.5416666716337204,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2057142857142857,
"grad_norm": 0.02520536072552204,
"kl": 0.001850128173828125,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0001,
"reward": -0.43168380856513977,
"reward_std": 0.32490337640047073,
"rewards/cosine_scaled_reward": -0.2366752326488495,
"rewards/format_reward": 0.0416666679084301,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 3096.291717529297,
"epoch": 0.2062857142857143,
"grad_norm": 0.02363760955631733,
"kl": 0.00234222412109375,
"learning_rate": 2.9576484845877793e-07,
"loss": -0.0555,
"reward": 0.39518995955586433,
"reward_std": 0.4662705436348915,
"rewards/cosine_scaled_reward": 0.03092828392982483,
"rewards/format_reward": 0.3333333358168602,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 3573.4583740234375,
"epoch": 0.20685714285714285,
"grad_norm": 0.02601708471775055,
"kl": 0.001556396484375,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0033,
"reward": -0.3869195654988289,
"reward_std": 0.4128815419971943,
"rewards/cosine_scaled_reward": -0.2976264376193285,
"rewards/format_reward": 0.2083333358168602,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 2522.8333740234375,
"epoch": 0.20742857142857143,
"grad_norm": 0.04574335739016533,
"kl": 0.008260726928710938,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.1671,
"reward": 0.26529842615127563,
"reward_std": 0.7703392207622528,
"rewards/cosine_scaled_reward": -0.11735078692436218,
"rewards/format_reward": 0.5000000037252903,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 3236.0416870117188,
"epoch": 0.208,
"grad_norm": 0.027814846485853195,
"kl": 0.0020885467529296875,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0791,
"reward": -0.2710934332571924,
"reward_std": 0.7278010919690132,
"rewards/cosine_scaled_reward": -0.21888004802167416,
"rewards/format_reward": 0.1666666679084301,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 2770.166778564453,
"epoch": 0.20857142857142857,
"grad_norm": 0.04530521109700203,
"kl": 0.0047168731689453125,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0958,
"reward": 0.03753130603581667,
"reward_std": 0.5634891390800476,
"rewards/cosine_scaled_reward": -0.23123434651643038,
"rewards/format_reward": 0.5000000037252903,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 3463.6250610351562,
"epoch": 0.20914285714285713,
"grad_norm": 0.026811106130480766,
"kl": 0.002040863037109375,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0349,
"reward": -0.44002775847911835,
"reward_std": 0.3063788739964366,
"rewards/cosine_scaled_reward": -0.2825138680636883,
"rewards/format_reward": 0.1250000037252903,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2256.500030517578,
"epoch": 0.20971428571428571,
"grad_norm": 0.03977304697036743,
"kl": 0.0020389556884765625,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.1914,
"reward": 0.6434868378564715,
"reward_std": 1.067148432135582,
"rewards/cosine_scaled_reward": 0.05091007053852081,
"rewards/format_reward": 0.541666679084301,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 2829.750030517578,
"epoch": 0.2102857142857143,
"grad_norm": 0.04306294396519661,
"kl": 0.003154754638671875,
"learning_rate": 2.7793039831193133e-07,
"loss": -0.0049,
"reward": 0.6332775764167309,
"reward_std": 1.3392103165388107,
"rewards/cosine_scaled_reward": 0.08747210912406445,
"rewards/format_reward": 0.4583333395421505,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 3277.0,
"epoch": 0.21085714285714285,
"grad_norm": 0.03245609253644943,
"kl": 0.0017871856689453125,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0693,
"reward": -0.3826030343770981,
"reward_std": 0.47108511719852686,
"rewards/cosine_scaled_reward": -0.27463484834879637,
"rewards/format_reward": 0.1666666716337204,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 3560.1666870117188,
"epoch": 0.21142857142857144,
"grad_norm": 0.02579626441001892,
"kl": 0.002105712890625,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0077,
"reward": -0.3091103732585907,
"reward_std": 0.7164483182132244,
"rewards/cosine_scaled_reward": -0.23788851499557495,
"rewards/format_reward": 0.1666666679084301,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 2541.4167098999023,
"epoch": 0.212,
"grad_norm": 0.03733627125620842,
"kl": 0.001705169677734375,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0138,
"reward": 0.672270392999053,
"reward_std": 0.9413716346025467,
"rewards/cosine_scaled_reward": 0.04446852207183838,
"rewards/format_reward": 0.5833333395421505,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 3581.3333740234375,
"epoch": 0.21257142857142858,
"grad_norm": 0.027517346665263176,
"kl": 0.0018329620361328125,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0012,
"reward": -0.2805359922349453,
"reward_std": 0.5577078722417355,
"rewards/cosine_scaled_reward": -0.1819346728734672,
"rewards/format_reward": 0.0833333358168602,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 2894.791732788086,
"epoch": 0.21314285714285713,
"grad_norm": 0.02603447064757347,
"kl": 0.0014095306396484375,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0451,
"reward": 0.4043912701308727,
"reward_std": 0.8573378399014473,
"rewards/cosine_scaled_reward": 0.014695605263113976,
"rewards/format_reward": 0.3750000037252903,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.21371428571428572,
"grad_norm": 0.025763656944036484,
"kl": 0.0022430419921875,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0001,
"reward": -0.6632598042488098,
"reward_std": 0.41906454414129257,
"rewards/cosine_scaled_reward": -0.3524632453918457,
"rewards/format_reward": 0.0416666679084301,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 2513.6666870117188,
"epoch": 0.21428571428571427,
"grad_norm": 0.04218330234289169,
"kl": 0.002178192138671875,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.1544,
"reward": 0.47941452637314796,
"reward_std": 0.7010052129626274,
"rewards/cosine_scaled_reward": -0.010292727500200272,
"rewards/format_reward": 0.5000000223517418,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 3257.0833740234375,
"epoch": 0.21485714285714286,
"grad_norm": 0.024601558223366737,
"kl": 0.002254486083984375,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0772,
"reward": 0.11639352887868881,
"reward_std": 0.771907739341259,
"rewards/cosine_scaled_reward": -0.1293032318353653,
"rewards/format_reward": 0.3750000074505806,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 2682.0833740234375,
"epoch": 0.21542857142857144,
"grad_norm": 0.034084804356098175,
"kl": 0.003185272216796875,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0269,
"reward": 0.19540098868310452,
"reward_std": 0.7716350331902504,
"rewards/cosine_scaled_reward": -0.13146617216989398,
"rewards/format_reward": 0.4583333395421505,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 3114.6666870117188,
"epoch": 0.216,
"grad_norm": 0.03742056339979172,
"kl": 0.00238800048828125,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0475,
"reward": -0.26397572830319405,
"reward_std": 0.42522556707262993,
"rewards/cosine_scaled_reward": -0.27782120555639267,
"rewards/format_reward": 0.291666679084301,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 3018.0,
"epoch": 0.21657142857142858,
"grad_norm": 0.041126009076833725,
"kl": 0.00194549560546875,
"learning_rate": 2.512332043064913e-07,
"loss": 0.1509,
"reward": -0.38718332070857286,
"reward_std": 0.12089738715440035,
"rewards/cosine_scaled_reward": -0.29775832826271653,
"rewards/format_reward": 0.2083333432674408,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 2921.5833740234375,
"epoch": 0.21714285714285714,
"grad_norm": 0.05002846568822861,
"kl": 0.0018787384033203125,
"learning_rate": 2.488912271385139e-07,
"loss": 0.1916,
"reward": 0.08173270896077156,
"reward_std": 1.0889613926410675,
"rewards/cosine_scaled_reward": -0.10496698506176472,
"rewards/format_reward": 0.291666679084301,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 2700.0833435058594,
"epoch": 0.21771428571428572,
"grad_norm": 0.07720932364463806,
"kl": 0.003021240234375,
"learning_rate": 2.465639255873246e-07,
"loss": 0.1772,
"reward": -0.28823885321617126,
"reward_std": 0.4211480915546417,
"rewards/cosine_scaled_reward": -0.33161942660808563,
"rewards/format_reward": 0.375,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 3033.9166870117188,
"epoch": 0.21828571428571428,
"grad_norm": 0.053646184504032135,
"kl": 0.002429962158203125,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.1777,
"reward": -0.4409410636872053,
"reward_std": 0.40626726672053337,
"rewards/cosine_scaled_reward": -0.32463720440864563,
"rewards/format_reward": 0.2083333358168602,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 2973.4584045410156,
"epoch": 0.21885714285714286,
"grad_norm": 0.030945565551519394,
"kl": 0.00351715087890625,
"learning_rate": 2.4195380233209006e-07,
"loss": -0.0247,
"reward": 0.33793809020426124,
"reward_std": 0.30829140171408653,
"rewards/cosine_scaled_reward": -0.018530961126089096,
"rewards/format_reward": 0.3750000037252903,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 3457.3750610351562,
"epoch": 0.21942857142857142,
"grad_norm": 0.02838779054582119,
"kl": 0.0016632080078125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0511,
"reward": 0.03136183321475983,
"reward_std": 0.49431246519088745,
"rewards/cosine_scaled_reward": -0.10931909084320068,
"rewards/format_reward": 0.2500000111758709,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2602.4166870117188,
"epoch": 0.22,
"grad_norm": 0.05043185502290726,
"kl": 0.003314971923828125,
"learning_rate": 2.374037332934512e-07,
"loss": 0.036,
"reward": 0.03084481880068779,
"reward_std": 0.7456474676728249,
"rewards/cosine_scaled_reward": -0.21374426037073135,
"rewards/format_reward": 0.4583333395421505,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 3196.041748046875,
"epoch": 0.22057142857142858,
"grad_norm": 0.04115245118737221,
"kl": 0.002292633056640625,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0747,
"reward": -0.1912713162600994,
"reward_std": 0.44368572533130646,
"rewards/cosine_scaled_reward": -0.26230233535170555,
"rewards/format_reward": 0.3333333469927311,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 2067.666717529297,
"epoch": 0.22114285714285714,
"grad_norm": 0.08692646771669388,
"kl": 0.0031585693359375,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.1591,
"reward": 1.0775700695812702,
"reward_std": 1.0433802232146263,
"rewards/cosine_scaled_reward": 0.2471183855086565,
"rewards/format_reward": 0.5833333358168602,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 1669.9167175292969,
"epoch": 0.22171428571428572,
"grad_norm": 0.055517517030239105,
"kl": 0.0025768280029296875,
"learning_rate": 2.306931685585657e-07,
"loss": 0.2493,
"reward": 1.2293222844600677,
"reward_std": 0.980612438172102,
"rewards/cosine_scaled_reward": 0.17716114781796932,
"rewards/format_reward": 0.8750000149011612,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 1695.708351135254,
"epoch": 0.22228571428571428,
"grad_norm": 0.03845082223415375,
"kl": 0.0021047592163085938,
"learning_rate": 2.2848729416523859e-07,
"loss": -0.0207,
"reward": 1.20601287484169,
"reward_std": 0.6396824978291988,
"rewards/cosine_scaled_reward": 0.22800641134381294,
"rewards/format_reward": 0.75,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 3062.7916870117188,
"epoch": 0.22285714285714286,
"grad_norm": 0.06756512820720673,
"kl": 0.0024547576904296875,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.1465,
"reward": -0.3023668974637985,
"reward_std": 0.3118130564689636,
"rewards/cosine_scaled_reward": -0.25535011664032936,
"rewards/format_reward": 0.2083333432674408,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 3022.2083740234375,
"epoch": 0.22342857142857142,
"grad_norm": 0.0756201446056366,
"kl": 0.0026340484619140625,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.1691,
"reward": -0.23833123594522476,
"reward_std": 0.2127583883702755,
"rewards/cosine_scaled_reward": -0.2649989537894726,
"rewards/format_reward": 0.291666679084301,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 3207.2083740234375,
"epoch": 0.224,
"grad_norm": 0.03767884895205498,
"kl": 0.001995086669921875,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0723,
"reward": -0.12611797451972961,
"reward_std": 0.36987370252609253,
"rewards/cosine_scaled_reward": -0.1672256588935852,
"rewards/format_reward": 0.2083333432674408,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 3324.4583740234375,
"epoch": 0.22457142857142856,
"grad_norm": 0.04080076143145561,
"kl": 0.0020618438720703125,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.09,
"reward": -0.6663314551115036,
"reward_std": 0.18218804895877838,
"rewards/cosine_scaled_reward": -0.3956657275557518,
"rewards/format_reward": 0.125,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 2578.0000610351562,
"epoch": 0.22514285714285714,
"grad_norm": 0.041851166635751724,
"kl": 0.003665924072265625,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.1105,
"reward": 0.41171833069529384,
"reward_std": 0.7391864433884621,
"rewards/cosine_scaled_reward": 0.01835917867720127,
"rewards/format_reward": 0.3750000037252903,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 3510.2083740234375,
"epoch": 0.2257142857142857,
"grad_norm": 0.025029897689819336,
"kl": 0.002216339111328125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0455,
"reward": -0.386491596698761,
"reward_std": 0.4648677408695221,
"rewards/cosine_scaled_reward": -0.21407912857830524,
"rewards/format_reward": 0.0416666679084301,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 3108.7083435058594,
"epoch": 0.22628571428571428,
"grad_norm": 0.0395159013569355,
"kl": 0.0013723373413085938,
"learning_rate": 2.134908592756607e-07,
"loss": 0.1131,
"reward": 0.20225340873003006,
"reward_std": 0.4511542022228241,
"rewards/cosine_scaled_reward": -0.06553995236754417,
"rewards/format_reward": 0.3333333432674408,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 2810.5000610351562,
"epoch": 0.22685714285714287,
"grad_norm": 0.10683666169643402,
"kl": 0.002685546875,
"learning_rate": 2.1141329099692406e-07,
"loss": -0.1331,
"reward": 0.5581239182502031,
"reward_std": 0.7452297080308199,
"rewards/cosine_scaled_reward": 0.04989526513963938,
"rewards/format_reward": 0.4583333432674408,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 3000.0416870117188,
"epoch": 0.22742857142857142,
"grad_norm": 0.042328979820013046,
"kl": 0.0017576217651367188,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0136,
"reward": 0.4953223764896393,
"reward_std": 0.475642804056406,
"rewards/cosine_scaled_reward": 0.01849452033638954,
"rewards/format_reward": 0.4583333395421505,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 3267.0416870117188,
"epoch": 0.228,
"grad_norm": 0.038258083164691925,
"kl": 0.004131317138671875,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0758,
"reward": -0.3403526246547699,
"reward_std": 0.40614739432930946,
"rewards/cosine_scaled_reward": -0.2535096574574709,
"rewards/format_reward": 0.1666666716337204,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 1433.2916717529297,
"epoch": 0.22857142857142856,
"grad_norm": 0.04750688746571541,
"kl": 0.0027313232421875,
"learning_rate": 2.0528000059645995e-07,
"loss": -0.0022,
"reward": 1.8511969447135925,
"reward_std": 0.6095073223114014,
"rewards/cosine_scaled_reward": 0.46726512676104903,
"rewards/format_reward": 0.9166666716337204,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 2776.541748046875,
"epoch": 0.22914285714285715,
"grad_norm": 0.04821674898266792,
"kl": 0.01263427734375,
"learning_rate": 2.032690407508949e-07,
"loss": -0.0218,
"reward": 0.21085133403539658,
"reward_std": 0.7990075498819351,
"rewards/cosine_scaled_reward": -0.14457433423376642,
"rewards/format_reward": 0.5000000074505806,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2849.5833435058594,
"epoch": 0.2297142857142857,
"grad_norm": 0.03391491249203682,
"kl": 0.001735687255859375,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0193,
"reward": -0.31716519594192505,
"reward_std": 0.26038678735494614,
"rewards/cosine_scaled_reward": -0.2835825942456722,
"rewards/format_reward": 0.25,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2503.9583740234375,
"epoch": 0.2302857142857143,
"grad_norm": 0.045575402677059174,
"kl": 0.00244903564453125,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.1805,
"reward": 0.4825909286737442,
"reward_std": 0.8773414939641953,
"rewards/cosine_scaled_reward": -0.05037122219800949,
"rewards/format_reward": 0.5833333358168602,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 2744.7083435058594,
"epoch": 0.23085714285714284,
"grad_norm": 0.057386625558137894,
"kl": 0.002567291259765625,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0864,
"reward": -0.24875595793128014,
"reward_std": 0.24890969693660736,
"rewards/cosine_scaled_reward": -0.27021132223308086,
"rewards/format_reward": 0.2916666679084301,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2514.4167098999023,
"epoch": 0.23142857142857143,
"grad_norm": 0.044321611523628235,
"kl": 0.0018768310546875,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.1052,
"reward": 0.3779886527918279,
"reward_std": 0.23864592984318733,
"rewards/cosine_scaled_reward": 0.0014943182468414307,
"rewards/format_reward": 0.375,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 3513.9166870117188,
"epoch": 0.232,
"grad_norm": 0.0408383384346962,
"kl": 0.003757476806640625,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0411,
"reward": -0.4136723382398486,
"reward_std": 0.815041683614254,
"rewards/cosine_scaled_reward": -0.24850285053253174,
"rewards/format_reward": 0.0833333358168602,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 2799.125,
"epoch": 0.23257142857142857,
"grad_norm": 0.03454533591866493,
"kl": 0.00251007080078125,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0384,
"reward": 0.5007706619799137,
"reward_std": 0.7364890426397324,
"rewards/cosine_scaled_reward": 0.06288533098995686,
"rewards/format_reward": 0.375,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 2209.7083740234375,
"epoch": 0.23314285714285715,
"grad_norm": 0.0733719989657402,
"kl": 0.005035400390625,
"learning_rate": 1.8967088307307e-07,
"loss": 0.3212,
"reward": 0.500348687171936,
"reward_std": 0.9539323709905148,
"rewards/cosine_scaled_reward": -0.020659001544117928,
"rewards/format_reward": 0.5416666865348816,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 3269.5833740234375,
"epoch": 0.2337142857142857,
"grad_norm": 0.05440355837345123,
"kl": 0.0030803680419921875,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.1228,
"reward": -0.007524088025093079,
"reward_std": 0.8484572544693947,
"rewards/cosine_scaled_reward": -0.12876203656196594,
"rewards/format_reward": 0.2500000074505806,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 3360.75,
"epoch": 0.2342857142857143,
"grad_norm": 0.02374698594212532,
"kl": 0.001850128173828125,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0121,
"reward": -0.16027933359146118,
"reward_std": 0.8020299822092056,
"rewards/cosine_scaled_reward": -0.2051396742463112,
"rewards/format_reward": 0.2500000074505806,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 3488.7083740234375,
"epoch": 0.23485714285714285,
"grad_norm": 0.023548219352960587,
"kl": 0.00241851806640625,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0373,
"reward": -0.030248835682868958,
"reward_std": 1.0068029686808586,
"rewards/cosine_scaled_reward": -0.14012442249804735,
"rewards/format_reward": 0.2500000037252903,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 3375.25,
"epoch": 0.23542857142857143,
"grad_norm": 0.03655938804149628,
"kl": 0.0019073486328125,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0973,
"reward": -0.5976834818720818,
"reward_std": 0.213936235755682,
"rewards/cosine_scaled_reward": -0.340508408844471,
"rewards/format_reward": 0.0833333358168602,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 2894.6250610351562,
"epoch": 0.236,
"grad_norm": 0.026368409395217896,
"kl": 0.001476287841796875,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0023,
"reward": 0.23069452494382858,
"reward_std": 1.1981160640716553,
"rewards/cosine_scaled_reward": -0.0929860845208168,
"rewards/format_reward": 0.416666679084301,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2836.0,
"epoch": 0.23657142857142857,
"grad_norm": 0.0354296937584877,
"kl": 0.002132415771484375,
"learning_rate": 1.7869892577476722e-07,
"loss": -0.0273,
"reward": -0.009219972416758537,
"reward_std": 0.33009819500148296,
"rewards/cosine_scaled_reward": -0.12960998620837927,
"rewards/format_reward": 0.25,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 2803.2916870117188,
"epoch": 0.23714285714285716,
"grad_norm": 0.0818825215101242,
"kl": 0.00244903564453125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.06,
"reward": 0.01829184591770172,
"reward_std": 0.5228419750928879,
"rewards/cosine_scaled_reward": -0.17835407704114914,
"rewards/format_reward": 0.375,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2377142857142857,
"grad_norm": 0.035418182611465454,
"kl": 0.001995086669921875,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0001,
"reward": -0.30091510340571404,
"reward_std": 0.13776107877492905,
"rewards/cosine_scaled_reward": -0.15045755077153444,
"rewards/format_reward": 0.0,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2493.12504196167,
"epoch": 0.2382857142857143,
"grad_norm": 0.04779450222849846,
"kl": 0.0051097869873046875,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0567,
"reward": 0.5757415145635605,
"reward_std": 0.3146047620102763,
"rewards/cosine_scaled_reward": 0.05870402604341507,
"rewards/format_reward": 0.4583333432674408,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2410.0,
"epoch": 0.23885714285714285,
"grad_norm": 0.06584033370018005,
"kl": 0.003627777099609375,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.1005,
"reward": -0.14499032497406006,
"reward_std": 0.47409145161509514,
"rewards/cosine_scaled_reward": -0.32249519135802984,
"rewards/format_reward": 0.5000000111758709,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 2817.8333740234375,
"epoch": 0.23942857142857144,
"grad_norm": 0.0567738376557827,
"kl": 0.002117156982421875,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.1418,
"reward": -0.03458383772522211,
"reward_std": 0.32715121656656265,
"rewards/cosine_scaled_reward": -0.22562525561079383,
"rewards/format_reward": 0.4166666865348816,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 1814.5000228881836,
"epoch": 0.24,
"grad_norm": 0.05227474868297577,
"kl": 0.0027008056640625,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.1518,
"reward": 0.9874219074845314,
"reward_std": 0.5618811529129744,
"rewards/cosine_scaled_reward": 0.16037755832076073,
"rewards/format_reward": 0.6666666716337204,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 3262.9583740234375,
"epoch": 0.24057142857142857,
"grad_norm": 0.04826692119240761,
"kl": 0.002506256103515625,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0562,
"reward": -0.2657657153904438,
"reward_std": 0.6167314350605011,
"rewards/cosine_scaled_reward": -0.237049525603652,
"rewards/format_reward": 0.2083333395421505,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24114285714285713,
"grad_norm": 0.021828195080161095,
"kl": 0.0014743804931640625,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0001,
"reward": -0.3239707574248314,
"reward_std": 0.5209229327738285,
"rewards/cosine_scaled_reward": -0.20365203730762005,
"rewards/format_reward": 0.0833333358168602,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2579.916717529297,
"epoch": 0.24171428571428571,
"grad_norm": 0.04154634848237038,
"kl": 0.0044078826904296875,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.1208,
"reward": 0.25859048834536225,
"reward_std": 0.9602373689413071,
"rewards/cosine_scaled_reward": -0.0998714342713356,
"rewards/format_reward": 0.4583333507180214,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 3340.7916870117188,
"epoch": 0.2422857142857143,
"grad_norm": 0.024497682228684425,
"kl": 0.002574920654296875,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0477,
"reward": -0.34070008620619774,
"reward_std": 0.5686692576855421,
"rewards/cosine_scaled_reward": -0.2745167203247547,
"rewards/format_reward": 0.2083333395421505,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24285714285714285,
"grad_norm": 0.02335195802152157,
"kl": 0.0016880035400390625,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0001,
"reward": -0.7255350053310394,
"reward_std": 0.11841294169425964,
"rewards/cosine_scaled_reward": -0.3627674952149391,
"rewards/format_reward": 0.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 3284.9166870117188,
"epoch": 0.24342857142857144,
"grad_norm": 0.02628624066710472,
"kl": 0.00174713134765625,
"learning_rate": 1.5872728172265146e-07,
"loss": -0.0484,
"reward": 0.16440905630588531,
"reward_std": 0.17891278490424156,
"rewards/cosine_scaled_reward": -0.042795535176992416,
"rewards/format_reward": 0.25,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 3153.0000610351562,
"epoch": 0.244,
"grad_norm": 0.025881120935082436,
"kl": 0.0017852783203125,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0466,
"reward": -0.14761827141046524,
"reward_std": 0.43717267736792564,
"rewards/cosine_scaled_reward": -0.2613091245293617,
"rewards/format_reward": 0.3750000111758709,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 2877.7083740234375,
"epoch": 0.24457142857142858,
"grad_norm": 0.028311334550380707,
"kl": 0.0018310546875,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0378,
"reward": 0.324504092335701,
"reward_std": 0.9603755623102188,
"rewards/cosine_scaled_reward": -0.0669146329164505,
"rewards/format_reward": 0.4583333358168602,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 3187.0833435058594,
"epoch": 0.24514285714285713,
"grad_norm": 0.06461922824382782,
"kl": 0.00226593017578125,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.1252,
"reward": -0.49600089713931084,
"reward_std": 0.2602513749152422,
"rewards/cosine_scaled_reward": -0.33133377879858017,
"rewards/format_reward": 0.1666666716337204,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 3474.1250610351562,
"epoch": 0.24571428571428572,
"grad_norm": 0.03904551640152931,
"kl": 0.0019245147705078125,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0228,
"reward": 0.08353859186172485,
"reward_std": 0.9899476245045662,
"rewards/cosine_scaled_reward": -0.08323072176426649,
"rewards/format_reward": 0.25,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24628571428571427,
"grad_norm": 0.021346671506762505,
"kl": 0.00138092041015625,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0001,
"reward": -0.5528823342174292,
"reward_std": 0.15007145144045353,
"rewards/cosine_scaled_reward": -0.2764411820098758,
"rewards/format_reward": 0.0,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 3018.125,
"epoch": 0.24685714285714286,
"grad_norm": 0.0652502253651619,
"kl": 0.0072345733642578125,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.1098,
"reward": -0.18478704243898392,
"reward_std": 0.3417491838335991,
"rewards/cosine_scaled_reward": -0.19656019657850266,
"rewards/format_reward": 0.2083333432674408,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 2424.2500610351562,
"epoch": 0.24742857142857144,
"grad_norm": 0.05150126665830612,
"kl": 0.002414703369140625,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0398,
"reward": 0.8158954866230488,
"reward_std": 0.4261186718940735,
"rewards/cosine_scaled_reward": 0.09544774331152439,
"rewards/format_reward": 0.6250000149011612,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 3382.7084350585938,
"epoch": 0.248,
"grad_norm": 0.029949190095067024,
"kl": 0.0025386810302734375,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0438,
"reward": 0.3182661309838295,
"reward_std": 0.6137006804347038,
"rewards/cosine_scaled_reward": 0.013299711048603058,
"rewards/format_reward": 0.291666679084301,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 2843.5,
"epoch": 0.24857142857142858,
"grad_norm": 0.03288929536938667,
"kl": 0.00148773193359375,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.1054,
"reward": 0.33230017125606537,
"reward_std": 0.22668367251753807,
"rewards/cosine_scaled_reward": 0.041150085628032684,
"rewards/format_reward": 0.25,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 2787.250045776367,
"epoch": 0.24914285714285714,
"grad_norm": 0.06184900179505348,
"kl": 0.002155303955078125,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.1594,
"reward": 0.6363462656736374,
"reward_std": 0.9836109708994627,
"rewards/cosine_scaled_reward": 0.06817315006628633,
"rewards/format_reward": 0.5000000223517418,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 1811.9584350585938,
"epoch": 0.24971428571428572,
"grad_norm": 0.05618860945105553,
"kl": 0.00218963623046875,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.3849,
"reward": 0.3208237960934639,
"reward_std": 0.8114956840872765,
"rewards/cosine_scaled_reward": -0.17292143777012825,
"rewards/format_reward": 0.666666679084301,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 2738.3333740234375,
"epoch": 0.2502857142857143,
"grad_norm": 0.02950606867671013,
"kl": 0.0029239654541015625,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.1087,
"reward": -0.003797575831413269,
"reward_std": 0.6448537092655897,
"rewards/cosine_scaled_reward": -0.21023213118314743,
"rewards/format_reward": 0.4166666865348816,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 2839.5833435058594,
"epoch": 0.25085714285714283,
"grad_norm": 0.05315336957573891,
"kl": 0.0044994354248046875,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.1635,
"reward": -0.20786919817328453,
"reward_std": 0.41989522241055965,
"rewards/cosine_scaled_reward": -0.2497679367661476,
"rewards/format_reward": 0.2916666716337204,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 2265.375030517578,
"epoch": 0.25142857142857145,
"grad_norm": 0.030458109453320503,
"kl": 0.0027713775634765625,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0069,
"reward": 0.7619366645812988,
"reward_std": 0.7980624493211508,
"rewards/cosine_scaled_reward": 0.026801645755767822,
"rewards/format_reward": 0.7083333432674408,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 3560.3333740234375,
"epoch": 0.252,
"grad_norm": 0.023687317967414856,
"kl": 0.0019683837890625,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0138,
"reward": -0.44265176355838776,
"reward_std": 0.44561421498656273,
"rewards/cosine_scaled_reward": -0.24215921759605408,
"rewards/format_reward": 0.0416666679084301,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2647.375045776367,
"epoch": 0.25257142857142856,
"grad_norm": 0.039222314953804016,
"kl": 0.00183868408203125,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0377,
"reward": 0.7825583815574646,
"reward_std": 0.5142037309706211,
"rewards/cosine_scaled_reward": 0.1621125414967537,
"rewards/format_reward": 0.4583333432674408,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 3567.75,
"epoch": 0.25314285714285717,
"grad_norm": 0.022901169955730438,
"kl": 0.0019474029541015625,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0091,
"reward": -0.6635128557682037,
"reward_std": 0.26263442169874907,
"rewards/cosine_scaled_reward": -0.35258975625038147,
"rewards/format_reward": 0.0416666679084301,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 3576.8333740234375,
"epoch": 0.2537142857142857,
"grad_norm": 0.02485896274447441,
"kl": 0.0015268325805664062,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0038,
"reward": -0.7272030115127563,
"reward_std": 0.2097564060240984,
"rewards/cosine_scaled_reward": -0.3844348266720772,
"rewards/format_reward": 0.0416666679084301,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 2878.6666870117188,
"epoch": 0.2542857142857143,
"grad_norm": 0.04667196050286293,
"kl": 0.002490997314453125,
"learning_rate": 1.3276726544494571e-07,
"loss": -0.071,
"reward": 0.5841437764465809,
"reward_std": 1.0845360681414604,
"rewards/cosine_scaled_reward": 0.08373854495584965,
"rewards/format_reward": 0.4166666679084301,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 2512.125,
"epoch": 0.25485714285714284,
"grad_norm": 0.04424026980996132,
"kl": 0.001781463623046875,
"learning_rate": 1.316005813502869e-07,
"loss": 0.1748,
"reward": -0.00977317988872528,
"reward_std": 0.47886658273637295,
"rewards/cosine_scaled_reward": -0.31738658621907234,
"rewards/format_reward": 0.6250000111758709,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 2817.875,
"epoch": 0.25542857142857145,
"grad_norm": 0.03891659900546074,
"kl": 0.00177001953125,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0274,
"reward": -0.3405243009328842,
"reward_std": 0.3018246004357934,
"rewards/cosine_scaled_reward": -0.2952621579170227,
"rewards/format_reward": 0.25,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 3169.2500610351562,
"epoch": 0.256,
"grad_norm": 0.02308580093085766,
"kl": 0.0024843215942382812,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.1093,
"reward": 0.33086367696523666,
"reward_std": 0.3447714298963547,
"rewards/cosine_scaled_reward": 0.08209850080311298,
"rewards/format_reward": 0.1666666679084301,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 3295.3750610351562,
"epoch": 0.25657142857142856,
"grad_norm": 0.03653385117650032,
"kl": 0.0014209747314453125,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0801,
"reward": 0.6504177749156952,
"reward_std": 0.9115674756467342,
"rewards/cosine_scaled_reward": 0.116875559091568,
"rewards/format_reward": 0.416666679084301,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 2286.9584045410156,
"epoch": 0.2571428571428571,
"grad_norm": 0.038941655308008194,
"kl": 0.002315521240234375,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.2534,
"reward": 0.6601591724902391,
"reward_std": 0.9432571902871132,
"rewards/cosine_scaled_reward": 0.038412906229496,
"rewards/format_reward": 0.5833333432674408,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 3239.4583740234375,
"epoch": 0.25771428571428573,
"grad_norm": 0.03524477407336235,
"kl": 0.0026187896728515625,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0859,
"reward": -0.48906414210796356,
"reward_std": 0.6134257167577744,
"rewards/cosine_scaled_reward": -0.3486987464129925,
"rewards/format_reward": 0.2083333358168602,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 2259.458335876465,
"epoch": 0.2582857142857143,
"grad_norm": 0.04546777531504631,
"kl": 0.0022125244140625,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0392,
"reward": 0.01176878809928894,
"reward_std": 0.3016066299751401,
"rewards/cosine_scaled_reward": -0.24411562457680702,
"rewards/format_reward": 0.5,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2659.0416717529297,
"epoch": 0.25885714285714284,
"grad_norm": 0.10386376827955246,
"kl": 0.002735137939453125,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.367,
"reward": 0.14252938330173492,
"reward_std": 0.6242268411442637,
"rewards/cosine_scaled_reward": -0.11623530462384224,
"rewards/format_reward": 0.3750000149011612,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 3498.2083740234375,
"epoch": 0.25942857142857145,
"grad_norm": 0.02672552317380905,
"kl": 0.0025157928466796875,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0108,
"reward": 0.2536497563123703,
"reward_std": 0.29181550443172455,
"rewards/cosine_scaled_reward": 0.001824900507926941,
"rewards/format_reward": 0.25,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 2880.9583740234375,
"epoch": 0.26,
"grad_norm": 0.028729798272252083,
"kl": 0.001354217529296875,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0853,
"reward": -0.03765217214822769,
"reward_std": 0.5502043664455414,
"rewards/cosine_scaled_reward": -0.18549276888370514,
"rewards/format_reward": 0.3333333358168602,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 3449.2916870117188,
"epoch": 0.26057142857142856,
"grad_norm": 0.04774753749370575,
"kl": 0.002819061279296875,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0866,
"reward": -0.2819949258118868,
"reward_std": 0.2065523061901331,
"rewards/cosine_scaled_reward": -0.16183079779148102,
"rewards/format_reward": 0.0416666679084301,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2905.1666717529297,
"epoch": 0.2611428571428571,
"grad_norm": 0.0340239517390728,
"kl": 0.0031032562255859375,
"learning_rate": 1.2012473704494537e-07,
"loss": -0.0145,
"reward": -0.20281050354242325,
"reward_std": 0.3757304400205612,
"rewards/cosine_scaled_reward": -0.22640525363385677,
"rewards/format_reward": 0.25,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 2852.4583435058594,
"epoch": 0.26171428571428573,
"grad_norm": 0.03048195131123066,
"kl": 0.0025997161865234375,
"learning_rate": 1.1920622611056974e-07,
"loss": -0.009,
"reward": -0.028884992003440857,
"reward_std": 0.5219508111476898,
"rewards/cosine_scaled_reward": -0.16027583554387093,
"rewards/format_reward": 0.2916666679084301,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 2762.750030517578,
"epoch": 0.2622857142857143,
"grad_norm": 0.032014597207307816,
"kl": 0.0016269683837890625,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0588,
"reward": 0.24004450626671314,
"reward_std": 0.9639499019831419,
"rewards/cosine_scaled_reward": -0.08831107523292303,
"rewards/format_reward": 0.4166666828095913,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 2788.4583740234375,
"epoch": 0.26285714285714284,
"grad_norm": 0.03341707959771156,
"kl": 0.002410888671875,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0347,
"reward": 0.3700142055749893,
"reward_std": 0.5437347143888474,
"rewards/cosine_scaled_reward": 0.01834043301641941,
"rewards/format_reward": 0.3333333358168602,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 2720.750045776367,
"epoch": 0.2634285714285714,
"grad_norm": 0.08468116074800491,
"kl": 0.0078277587890625,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.139,
"reward": 0.04922260344028473,
"reward_std": 0.6591002717614174,
"rewards/cosine_scaled_reward": -0.18372203782200813,
"rewards/format_reward": 0.4166666679084301,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 3423.9166870117188,
"epoch": 0.264,
"grad_norm": 0.02148551493883133,
"kl": 0.0011806488037109375,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0311,
"reward": 0.060897987335920334,
"reward_std": 0.43522951705381274,
"rewards/cosine_scaled_reward": -0.05288431979715824,
"rewards/format_reward": 0.1666666716337204,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 3273.666748046875,
"epoch": 0.26457142857142857,
"grad_norm": 0.02729329653084278,
"kl": 0.0020809173583984375,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0355,
"reward": -0.38024890422821045,
"reward_std": 0.5462675094604492,
"rewards/cosine_scaled_reward": -0.294291116297245,
"rewards/format_reward": 0.2083333358168602,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 2997.166717529297,
"epoch": 0.2651428571428571,
"grad_norm": 0.04993213713169098,
"kl": 0.00254058837890625,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0656,
"reward": 0.5120486691594124,
"reward_std": 0.6698486655950546,
"rewards/cosine_scaled_reward": 0.04769100248813629,
"rewards/format_reward": 0.4166666865348816,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 3465.9583740234375,
"epoch": 0.26571428571428574,
"grad_norm": 0.02075813338160515,
"kl": 0.0014781951904296875,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0302,
"reward": -0.12447906285524368,
"reward_std": 0.5544179957360029,
"rewards/cosine_scaled_reward": -0.12473953887820244,
"rewards/format_reward": 0.125,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 2555.750045776367,
"epoch": 0.2662857142857143,
"grad_norm": 0.035985130816698074,
"kl": 0.0020618438720703125,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0489,
"reward": 0.05332493036985397,
"reward_std": 0.740821436047554,
"rewards/cosine_scaled_reward": -0.24417087901383638,
"rewards/format_reward": 0.5416666753590107,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 3410.0416870117188,
"epoch": 0.26685714285714285,
"grad_norm": 0.026065455749630928,
"kl": 0.0016069412231445312,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0103,
"reward": 0.09300320036709309,
"reward_std": 0.6775085739791393,
"rewards/cosine_scaled_reward": -0.09933173656463623,
"rewards/format_reward": 0.2916666716337204,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 3171.541748046875,
"epoch": 0.2674285714285714,
"grad_norm": 0.026952385902404785,
"kl": 0.0025081634521484375,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0422,
"reward": 0.3536413833498955,
"reward_std": 0.814384575933218,
"rewards/cosine_scaled_reward": -0.010679319500923157,
"rewards/format_reward": 0.3750000037252903,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 3274.541748046875,
"epoch": 0.268,
"grad_norm": 0.036863405257463455,
"kl": 0.0023822784423828125,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.1268,
"reward": -0.6220985129475594,
"reward_std": 0.3144175373017788,
"rewards/cosine_scaled_reward": -0.3943825885653496,
"rewards/format_reward": 0.1666666679084301,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 2011.0,
"epoch": 0.26857142857142857,
"grad_norm": 0.066426120698452,
"kl": 0.0033550262451171875,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0637,
"reward": 1.0168231502175331,
"reward_std": 0.9499584957957268,
"rewards/cosine_scaled_reward": 0.21674491092562675,
"rewards/format_reward": 0.5833333358168602,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 2807.125030517578,
"epoch": 0.26914285714285713,
"grad_norm": 0.04926472529768944,
"kl": 0.0023593902587890625,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0569,
"reward": 0.29307880252599716,
"reward_std": 0.7111775614321232,
"rewards/cosine_scaled_reward": -0.10346062108874321,
"rewards/format_reward": 0.5,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 3437.3750610351562,
"epoch": 0.26971428571428574,
"grad_norm": 0.025504395365715027,
"kl": 0.0024929046630859375,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0569,
"reward": -0.4390937387943268,
"reward_std": 0.34914129227399826,
"rewards/cosine_scaled_reward": -0.2612135373055935,
"rewards/format_reward": 0.0833333358168602,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 2982.7083435058594,
"epoch": 0.2702857142857143,
"grad_norm": 0.02571857161819935,
"kl": 0.002109527587890625,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0539,
"reward": -0.3720186948776245,
"reward_std": 0.41678102128207684,
"rewards/cosine_scaled_reward": -0.3110093427821994,
"rewards/format_reward": 0.25,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 2513.7916870117188,
"epoch": 0.27085714285714285,
"grad_norm": 0.03318493440747261,
"kl": 0.0011577606201171875,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.1261,
"reward": 0.3385324701666832,
"reward_std": 0.5001957044005394,
"rewards/cosine_scaled_reward": -0.05990046262741089,
"rewards/format_reward": 0.4583333432674408,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 3129.7500610351562,
"epoch": 0.2714285714285714,
"grad_norm": 0.039005525410175323,
"kl": 0.0020847320556640625,
"learning_rate": 1.068365111445064e-07,
"loss": 0.1043,
"reward": 0.6186982914805412,
"reward_std": 0.5874023661017418,
"rewards/cosine_scaled_reward": 0.05934913083910942,
"rewards/format_reward": 0.5000000111758709,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.272,
"grad_norm": 0.02882551960647106,
"kl": 0.00218963623046875,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0001,
"reward": -0.8095465898513794,
"reward_std": 0.1690497798845172,
"rewards/cosine_scaled_reward": -0.4047732949256897,
"rewards/format_reward": 0.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 3426.875,
"epoch": 0.2725714285714286,
"grad_norm": 0.025120312348008156,
"kl": 0.001926422119140625,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0252,
"reward": -0.4069552142173052,
"reward_std": 0.6788763776421547,
"rewards/cosine_scaled_reward": -0.2868109419941902,
"rewards/format_reward": 0.1666666716337204,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.27314285714285713,
"grad_norm": 0.02181798592209816,
"kl": 0.0015773773193359375,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0001,
"reward": -0.34078748896718025,
"reward_std": 0.19651156850159168,
"rewards/cosine_scaled_reward": -0.17039374075829983,
"rewards/format_reward": 0.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 2605.5,
"epoch": 0.2737142857142857,
"grad_norm": 0.03717919439077377,
"kl": 0.002017974853515625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0204,
"reward": 0.655741736292839,
"reward_std": 0.5535851716995239,
"rewards/cosine_scaled_reward": 0.09870417974889278,
"rewards/format_reward": 0.4583333395421505,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 3369.2500610351562,
"epoch": 0.2742857142857143,
"grad_norm": 0.03039778769016266,
"kl": 0.0029621124267578125,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0675,
"reward": 0.11506177484989166,
"reward_std": 0.895841971039772,
"rewards/cosine_scaled_reward": -0.06746910512447357,
"rewards/format_reward": 0.25,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 3044.291717529297,
"epoch": 0.27485714285714286,
"grad_norm": 0.06073984503746033,
"kl": 0.00298309326171875,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.1597,
"reward": -0.19490773417055607,
"reward_std": 0.47547537460923195,
"rewards/cosine_scaled_reward": -0.22245385870337486,
"rewards/format_reward": 0.2500000111758709,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 3030.625030517578,
"epoch": 0.2754285714285714,
"grad_norm": 0.031011393293738365,
"kl": 0.003162384033203125,
"learning_rate": 1.0354838440848501e-07,
"loss": -0.0535,
"reward": -0.1074385792016983,
"reward_std": 0.45957521721720695,
"rewards/cosine_scaled_reward": -0.1995526235550642,
"rewards/format_reward": 0.2916666679084301,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 3357.4583740234375,
"epoch": 0.276,
"grad_norm": 0.03088713437318802,
"kl": 0.003414154052734375,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0214,
"reward": -0.2319830283522606,
"reward_std": 0.731654766947031,
"rewards/cosine_scaled_reward": -0.2201581783592701,
"rewards/format_reward": 0.2083333358168602,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 2569.4167098999023,
"epoch": 0.2765714285714286,
"grad_norm": 0.059074219316244125,
"kl": 0.0023174285888671875,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.1363,
"reward": -0.241471815854311,
"reward_std": 0.2202180060558021,
"rewards/cosine_scaled_reward": -0.30823590885847807,
"rewards/format_reward": 0.375,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 3371.0,
"epoch": 0.27714285714285714,
"grad_norm": 0.028913574293255806,
"kl": 0.002979278564453125,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0252,
"reward": -0.016256675124168396,
"reward_std": 0.5143093690276146,
"rewards/cosine_scaled_reward": -0.133128359913826,
"rewards/format_reward": 0.2500000111758709,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 2765.3333587646484,
"epoch": 0.2777142857142857,
"grad_norm": 0.03141855075955391,
"kl": 0.0017871856689453125,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0405,
"reward": -0.0866716280579567,
"reward_std": 0.5073481593281031,
"rewards/cosine_scaled_reward": -0.18916914984583855,
"rewards/format_reward": 0.2916666679084301,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 2358.6250610351562,
"epoch": 0.2782857142857143,
"grad_norm": 0.05168556049466133,
"kl": 0.0022258758544921875,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.235,
"reward": 0.6061635762453079,
"reward_std": 0.9165201224386692,
"rewards/cosine_scaled_reward": 0.07391515374183655,
"rewards/format_reward": 0.4583333395421505,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 3409.8750610351562,
"epoch": 0.27885714285714286,
"grad_norm": 0.021467547863721848,
"kl": 0.0016460418701171875,
"learning_rate": 1.0157821333772304e-07,
"loss": -0.004,
"reward": -0.00885394960641861,
"reward_std": 0.7331012841314077,
"rewards/cosine_scaled_reward": -0.10859363805502653,
"rewards/format_reward": 0.2083333358168602,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 3269.6666870117188,
"epoch": 0.2794285714285714,
"grad_norm": 0.03530421480536461,
"kl": 0.00347137451171875,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0984,
"reward": -0.33968937769532204,
"reward_std": 0.504605233669281,
"rewards/cosine_scaled_reward": -0.25317803025245667,
"rewards/format_reward": 0.1666666679084301,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 2451.2500610351562,
"epoch": 0.28,
"grad_norm": 0.08236681669950485,
"kl": 0.0017137527465820312,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.3643,
"reward": 0.32231756299734116,
"reward_std": 0.8210179954767227,
"rewards/cosine_scaled_reward": -0.06800790876150131,
"rewards/format_reward": 0.4583333544433117,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 2534.541732788086,
"epoch": 0.2805714285714286,
"grad_norm": 0.05246545746922493,
"kl": 0.0018711090087890625,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.1321,
"reward": 0.22250002506189048,
"reward_std": 0.8957063853740692,
"rewards/cosine_scaled_reward": -0.11791664734482765,
"rewards/format_reward": 0.4583333358168602,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 3256.4583740234375,
"epoch": 0.28114285714285714,
"grad_norm": 0.03301486000418663,
"kl": 0.00229644775390625,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0448,
"reward": -0.07639187574386597,
"reward_std": 0.5896043181419373,
"rewards/cosine_scaled_reward": -0.1840292802080512,
"rewards/format_reward": 0.2916666716337204,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 2852.2083587646484,
"epoch": 0.2817142857142857,
"grad_norm": 0.054885730147361755,
"kl": 0.0047435760498046875,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0043,
"reward": 0.472271591424942,
"reward_std": 1.0421324595808983,
"rewards/cosine_scaled_reward": -0.013864208827726543,
"rewards/format_reward": 0.5,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 2882.791717529297,
"epoch": 0.2822857142857143,
"grad_norm": 0.029303675517439842,
"kl": 0.0027675628662109375,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0537,
"reward": 0.3263639658689499,
"reward_std": 0.7202490773051977,
"rewards/cosine_scaled_reward": -0.045151323080062866,
"rewards/format_reward": 0.4166666716337204,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 3436.1250610351562,
"epoch": 0.28285714285714286,
"grad_norm": 0.028042590245604515,
"kl": 0.0020809173583984375,
"learning_rate": 1.002741278414069e-07,
"loss": 0.045,
"reward": -0.1629636436700821,
"reward_std": 0.5904333665966988,
"rewards/cosine_scaled_reward": -0.18564849719405174,
"rewards/format_reward": 0.2083333395421505,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 2183.125030517578,
"epoch": 0.2834285714285714,
"grad_norm": 0.0886075347661972,
"kl": 0.0047855377197265625,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.1587,
"reward": 0.400937108322978,
"reward_std": 0.9074641615152359,
"rewards/cosine_scaled_reward": -0.1328647844493389,
"rewards/format_reward": 0.6666666865348816,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 3559.5416870117188,
"epoch": 0.284,
"grad_norm": 0.023150354623794556,
"kl": 0.0012216567993164062,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0102,
"reward": -0.23256949335336685,
"reward_std": 0.47016076277941465,
"rewards/cosine_scaled_reward": -0.17878474527969956,
"rewards/format_reward": 0.1250000037252903,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 2844.2083740234375,
"epoch": 0.2845714285714286,
"grad_norm": 0.04149520397186279,
"kl": 0.0020809173583984375,
"learning_rate": 1.000438641958131e-07,
"loss": 0.018,
"reward": 0.28929878026247025,
"reward_std": 0.7239453122019768,
"rewards/cosine_scaled_reward": -0.0011839456856250763,
"rewards/format_reward": 0.291666679084301,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 3106.0834350585938,
"epoch": 0.28514285714285714,
"grad_norm": 0.03673223406076431,
"kl": 0.0026187896728515625,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.1597,
"reward": 0.6660276614129543,
"reward_std": 1.4682428240776062,
"rewards/cosine_scaled_reward": 0.12468050047755241,
"rewards/format_reward": 0.416666679084301,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 2353.9166870117188,
"epoch": 0.2857142857142857,
"grad_norm": 0.06490905582904816,
"kl": 0.00217437744140625,
"learning_rate": 1e-07,
"loss": -0.1238,
"reward": 0.3091944605112076,
"reward_std": 0.7177844867110252,
"rewards/cosine_scaled_reward": -0.09540278650820255,
"rewards/format_reward": 0.5,
"step": 500
},
{
"epoch": 0.2857142857142857,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.07056569841801684,
"train_runtime": 25574.2895,
"train_samples_per_second": 0.469,
"train_steps_per_second": 0.02
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}