{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5714285714285714,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 1644.166748046875,
      "epoch": 0.001142857142857143,
      "grad_norm": 0.20607953518495117,
      "kl": 0.0,
      "learning_rate": 2e-08,
      "loss": 0.0022,
      "reward": -0.1127668060362339,
      "reward_std": 0.20213491283357143,
      "rewards/cosine_scaled_reward": -0.18138340720906854,
      "rewards/format_reward": 0.25,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1656.791748046875,
      "epoch": 0.002285714285714286,
      "grad_norm": 0.31679714617652144,
      "kl": 0.0,
      "learning_rate": 4e-08,
      "loss": 0.0623,
      "reward": -0.05582176148891449,
      "reward_std": 0.6275629922747612,
      "rewards/cosine_scaled_reward": -0.19457754865288734,
      "rewards/format_reward": 0.3333333432674408,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1606.7500610351562,
      "epoch": 0.0034285714285714284,
      "grad_norm": 0.2789602147805501,
      "kl": 3.388524055480957e-05,
      "learning_rate": 6e-08,
      "loss": 0.0376,
      "reward": -0.2583192214369774,
      "reward_std": 0.2636854462325573,
      "rewards/cosine_scaled_reward": -0.222909614443779,
      "rewards/format_reward": 0.1875000074505806,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1690.6250610351562,
      "epoch": 0.004571428571428572,
      "grad_norm": 0.27232938747073254,
      "kl": 4.017353057861328e-05,
      "learning_rate": 8e-08,
      "loss": 0.0159,
      "reward": -0.40017254278063774,
      "reward_std": 0.17111004143953323,
      "rewards/cosine_scaled_reward": -0.3146696165204048,
      "rewards/format_reward": 0.2291666716337204,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1618.3541870117188,
      "epoch": 0.005714285714285714,
      "grad_norm": 0.2939867481096334,
      "kl": 2.8431415557861328e-05,
      "learning_rate": 1e-07,
      "loss": 0.0576,
      "reward": 0.13743871822953224,
      "reward_std": 0.7271581590175629,
      "rewards/cosine_scaled_reward": -0.12919731251895428,
      "rewards/format_reward": 0.3958333395421505,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1629.4791870117188,
      "epoch": 0.006857142857142857,
      "grad_norm": 0.248871735331751,
      "kl": 3.477931022644043e-05,
      "learning_rate": 1.2e-07,
      "loss": -0.0029,
      "reward": -0.029103130102157593,
      "reward_std": 0.5708433166146278,
      "rewards/cosine_scaled_reward": -0.1708015874028206,
      "rewards/format_reward": 0.3125000037252903,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1490.6458740234375,
      "epoch": 0.008,
      "grad_norm": 0.22790937530079167,
      "kl": 3.007054328918457e-05,
      "learning_rate": 1.4e-07,
      "loss": 0.0903,
      "reward": 0.12145921215415001,
      "reward_std": 0.5416159555315971,
      "rewards/cosine_scaled_reward": -0.10593708232045174,
      "rewards/format_reward": 0.33333334140479565,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1683.5000305175781,
      "epoch": 0.009142857142857144,
      "grad_norm": 0.20752077742039396,
      "kl": 4.646182060241699e-05,
      "learning_rate": 1.6e-07,
      "loss": 0.0277,
      "reward": -0.23692437633872032,
      "reward_std": 0.4620281979441643,
      "rewards/cosine_scaled_reward": -0.2747122012078762,
      "rewards/format_reward": 0.31250000558793545,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1719.2292175292969,
      "epoch": 0.010285714285714285,
      "grad_norm": 0.2983323511333683,
      "kl": 4.1991472244262695e-05,
      "learning_rate": 1.8e-07,
      "loss": 0.0511,
      "reward": -0.31221747025847435,
      "reward_std": 0.21310735493898392,
      "rewards/cosine_scaled_reward": -0.24985874257981777,
      "rewards/format_reward": 0.1875000074505806,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1477.2083740234375,
      "epoch": 0.011428571428571429,
      "grad_norm": 0.23645082786220448,
      "kl": 3.116577863693237e-05,
      "learning_rate": 2e-07,
      "loss": 0.0495,
      "reward": 0.37697479128837585,
      "reward_std": 0.44906593672931194,
      "rewards/cosine_scaled_reward": -0.05109592713415623,
      "rewards/format_reward": 0.4791666716337204,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1508.8958587646484,
      "epoch": 0.012571428571428572,
      "grad_norm": 0.339825377520832,
      "kl": 2.8848648071289062e-05,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 0.0535,
      "reward": -0.13005081936717033,
      "reward_std": 0.6173823103308678,
      "rewards/cosine_scaled_reward": -0.2525254301726818,
      "rewards/format_reward": 0.37500000558793545,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1631.1041870117188,
      "epoch": 0.013714285714285714,
      "grad_norm": 0.20658630326267732,
      "kl": 3.084540367126465e-05,
      "learning_rate": 2.4e-07,
      "loss": 0.0635,
      "reward": 0.03064786270260811,
      "reward_std": 0.4376446008682251,
      "rewards/cosine_scaled_reward": -0.1513427309691906,
      "rewards/format_reward": 0.33333334140479565,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1422.604232788086,
      "epoch": 0.014857142857142857,
      "grad_norm": 0.23614097630983502,
      "kl": 2.527981996536255e-05,
      "learning_rate": 2.6e-07,
      "loss": -0.0306,
      "reward": 0.4512472003698349,
      "reward_std": 0.40983884781599045,
      "rewards/cosine_scaled_reward": -0.02437640482094139,
      "rewards/format_reward": 0.5,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1652.3542175292969,
      "epoch": 0.016,
      "grad_norm": 0.2206408502680819,
      "kl": 3.93986701965332e-05,
      "learning_rate": 2.8e-07,
      "loss": 0.0059,
      "reward": -0.2542928569018841,
      "reward_std": 0.17246506363153458,
      "rewards/cosine_scaled_reward": -0.26256311126053333,
      "rewards/format_reward": 0.2708333395421505,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1679.229248046875,
      "epoch": 0.017142857142857144,
      "grad_norm": 0.2314183406404789,
      "kl": 4.3898820877075195e-05,
      "learning_rate": 3e-07,
      "loss": 0.0053,
      "reward": -0.258657343685627,
      "reward_std": 0.23606499657034874,
      "rewards/cosine_scaled_reward": -0.1918286692816764,
      "rewards/format_reward": 0.125,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1396.7917175292969,
      "epoch": 0.018285714285714287,
      "grad_norm": 0.25436941656143647,
      "kl": 2.3171305656433105e-05,
      "learning_rate": 3.2e-07,
      "loss": 0.1053,
      "reward": 0.20216324925422668,
      "reward_std": 0.4999893419444561,
      "rewards/cosine_scaled_reward": -0.13850171491503716,
      "rewards/format_reward": 0.4791666716337204,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1719.416748046875,
      "epoch": 0.019428571428571427,
      "grad_norm": 0.23312894299622924,
      "kl": 4.0084123611450195e-05,
      "learning_rate": 3.4000000000000003e-07,
      "loss": -0.0007,
      "reward": -0.41149570792913437,
      "reward_std": 0.13166083209216595,
      "rewards/cosine_scaled_reward": -0.26824783720076084,
      "rewards/format_reward": 0.125,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1686.0833740234375,
      "epoch": 0.02057142857142857,
      "grad_norm": 0.24676487462788851,
      "kl": 4.7713518142700195e-05,
      "learning_rate": 3.6e-07,
      "loss": 0.0814,
      "reward": -0.32610235549509525,
      "reward_std": 0.23402154073119164,
      "rewards/cosine_scaled_reward": -0.25680116564035416,
      "rewards/format_reward": 0.18750000186264515,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1773.6458740234375,
      "epoch": 0.021714285714285714,
      "grad_norm": 0.21561964662639843,
      "kl": 2.1457672119140625e-05,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 0.0164,
      "reward": -0.5961569249629974,
      "reward_std": 0.1714775264263153,
      "rewards/cosine_scaled_reward": -0.3501618057489395,
      "rewards/format_reward": 0.10416666977107525,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1529.3125610351562,
      "epoch": 0.022857142857142857,
      "grad_norm": 0.251130340260543,
      "kl": 3.24249267578125e-05,
      "learning_rate": 4e-07,
      "loss": 0.0293,
      "reward": -0.048260755836963654,
      "reward_std": 0.34835576079785824,
      "rewards/cosine_scaled_reward": -0.20121371746063232,
      "rewards/format_reward": 0.35416667722165585,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1494.6250305175781,
      "epoch": 0.024,
      "grad_norm": 0.3018968569179871,
      "kl": 2.6673078536987305e-05,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 0.0278,
      "reward": 0.021329142153263092,
      "reward_std": 0.45257429778575897,
      "rewards/cosine_scaled_reward": -0.15600210055708885,
      "rewards/format_reward": 0.3333333358168602,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1778.5625610351562,
      "epoch": 0.025142857142857144,
      "grad_norm": 0.29253387654098556,
      "kl": 3.1888484954833984e-05,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 0.0494,
      "reward": -0.5034094974398613,
      "reward_std": 0.3080843798816204,
      "rewards/cosine_scaled_reward": -0.29337141662836075,
      "rewards/format_reward": 0.08333333395421505,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1762.8958740234375,
      "epoch": 0.026285714285714287,
      "grad_norm": 0.21053978305274443,
      "kl": 4.506111145019531e-05,
      "learning_rate": 4.6e-07,
      "loss": 0.0144,
      "reward": -0.028878159821033478,
      "reward_std": 0.5564102046191692,
      "rewards/cosine_scaled_reward": -0.10818908177316189,
      "rewards/format_reward": 0.1875000074505806,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1352.5625305175781,
      "epoch": 0.027428571428571427,
      "grad_norm": 0.20202450012624545,
      "kl": 1.6548670828342438e-05,
      "learning_rate": 4.8e-07,
      "loss": 0.0005,
      "reward": 0.6555859744548798,
      "reward_std": 0.47822858951985836,
      "rewards/cosine_scaled_reward": 0.06737629324197769,
      "rewards/format_reward": 0.520833333954215,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1597.1875610351562,
      "epoch": 0.02857142857142857,
      "grad_norm": 0.4327230812041704,
      "kl": 3.0606985092163086e-05,
      "learning_rate": 5e-07,
      "loss": 0.0701,
      "reward": 0.05484675616025925,
      "reward_std": 0.6329891942441463,
      "rewards/cosine_scaled_reward": -0.11840994283556938,
      "rewards/format_reward": 0.29166667722165585,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1647.916748046875,
      "epoch": 0.029714285714285714,
      "grad_norm": 0.21123992049117873,
      "kl": 2.2917985916137695e-05,
      "learning_rate": 5.2e-07,
      "loss": 0.031,
      "reward": -0.24321994185447693,
      "reward_std": 0.12097731977701187,
      "rewards/cosine_scaled_reward": -0.18410997837781906,
      "rewards/format_reward": 0.125,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1638.8958740234375,
      "epoch": 0.030857142857142857,
      "grad_norm": 0.21745088219923464,
      "kl": 3.2067298889160156e-05,
      "learning_rate": 5.4e-07,
      "loss": -0.0097,
      "reward": -0.3657397888600826,
      "reward_std": 0.24539830163121223,
      "rewards/cosine_scaled_reward": -0.2974532376974821,
      "rewards/format_reward": 0.2291666716337204,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1711.2709045410156,
      "epoch": 0.032,
      "grad_norm": 0.2552233664551883,
      "kl": 2.8468668460845947e-05,
      "learning_rate": 5.6e-07,
      "loss": 0.0256,
      "reward": -0.38710537925362587,
      "reward_std": 0.2530311979353428,
      "rewards/cosine_scaled_reward": -0.2768860347568989,
      "rewards/format_reward": 0.1666666716337204,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1713.8125610351562,
      "epoch": 0.03314285714285714,
      "grad_norm": 0.202249350617508,
      "kl": 2.86102294921875e-05,
      "learning_rate": 5.8e-07,
      "loss": 0.0135,
      "reward": -0.1931730881333351,
      "reward_std": 0.5632064789533615,
      "rewards/cosine_scaled_reward": -0.20075321290642023,
      "rewards/format_reward": 0.2083333358168602,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1732.291748046875,
      "epoch": 0.03428571428571429,
      "grad_norm": 0.23328556356102392,
      "kl": 2.165883779525757e-05,
      "learning_rate": 6e-07,
      "loss": 0.0564,
      "reward": -0.3746844604611397,
      "reward_std": 0.34011659026145935,
      "rewards/cosine_scaled_reward": -0.24984224140644073,
      "rewards/format_reward": 0.12500000186264515,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1445.3125305175781,
      "epoch": 0.03542857142857143,
      "grad_norm": 0.30643607095324277,
      "kl": 3.966689109802246e-05,
      "learning_rate": 6.2e-07,
      "loss": 0.0923,
      "reward": -0.09436208941042423,
      "reward_std": 0.3265727870166302,
      "rewards/cosine_scaled_reward": -0.21384770551230758,
      "rewards/format_reward": 0.33333333395421505,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1810.7917175292969,
      "epoch": 0.036571428571428574,
      "grad_norm": 0.20484433233713875,
      "kl": 2.8021633625030518e-05,
      "learning_rate": 6.4e-07,
      "loss": 0.0202,
      "reward": -0.5034667998552322,
      "reward_std": 0.15860500000417233,
      "rewards/cosine_scaled_reward": -0.2621500678360462,
      "rewards/format_reward": 0.02083333395421505,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1750.9584045410156,
      "epoch": 0.037714285714285714,
      "grad_norm": 0.2027434434467969,
      "kl": 2.5600194931030273e-05,
      "learning_rate": 6.6e-07,
      "loss": -0.0171,
      "reward": -0.25296103954315186,
      "reward_std": 0.4817052260041237,
      "rewards/cosine_scaled_reward": -0.2514805067330599,
      "rewards/format_reward": 0.25000000558793545,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1634.8333740234375,
      "epoch": 0.038857142857142854,
      "grad_norm": 0.23764579059557195,
      "kl": 2.331659197807312e-05,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0003,
      "reward": -0.3657361939549446,
      "reward_std": 0.2039697989821434,
      "rewards/cosine_scaled_reward": -0.25578476674854755,
      "rewards/format_reward": 0.14583333395421505,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1691.1875610351562,
      "epoch": 0.04,
      "grad_norm": 0.2390715088796384,
      "kl": 1.8522143363952637e-05,
      "learning_rate": 7e-07,
      "loss": 0.0579,
      "reward": -0.1916074175387621,
      "reward_std": 0.40257398039102554,
      "rewards/cosine_scaled_reward": -0.23122038505971432,
      "rewards/format_reward": 0.27083334885537624,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1526.2292175292969,
      "epoch": 0.04114285714285714,
      "grad_norm": 0.2361249356185026,
      "kl": 3.781914710998535e-05,
      "learning_rate": 7.2e-07,
      "loss": 0.0401,
      "reward": 0.35939645767211914,
      "reward_std": 0.39011720940470695,
      "rewards/cosine_scaled_reward": -0.01821846514940262,
      "rewards/format_reward": 0.3958333395421505,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1645.7708740234375,
      "epoch": 0.04228571428571429,
      "grad_norm": 0.26864783041008133,
      "kl": 3.820657730102539e-05,
      "learning_rate": 7.4e-07,
      "loss": 0.0746,
      "reward": -0.2870800420641899,
      "reward_std": 0.46812814101576805,
      "rewards/cosine_scaled_reward": -0.25812335684895515,
      "rewards/format_reward": 0.2291666679084301,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1722.5000610351562,
      "epoch": 0.04342857142857143,
      "grad_norm": 0.27664066975056834,
      "kl": 5.131959915161133e-05,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.0586,
      "reward": -0.15014038234949112,
      "reward_std": 0.4126087427139282,
      "rewards/cosine_scaled_reward": -0.2000702191144228,
      "rewards/format_reward": 0.2500000074505806,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1678.7083740234375,
      "epoch": 0.044571428571428574,
      "grad_norm": 0.3003829192682386,
      "kl": 4.968792200088501e-05,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.097,
      "reward": -0.21257384680211544,
      "reward_std": 0.48539142310619354,
      "rewards/cosine_scaled_reward": -0.2312869280576706,
      "rewards/format_reward": 0.2500000111758709,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1690.8958740234375,
      "epoch": 0.045714285714285714,
      "grad_norm": 0.20909108511646457,
      "kl": 5.0902366638183594e-05,
      "learning_rate": 8e-07,
      "loss": 0.0436,
      "reward": -0.5045258924365044,
      "reward_std": 0.2920587807893753,
      "rewards/cosine_scaled_reward": -0.3564296290278435,
      "rewards/format_reward": 0.2083333358168602,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1806.3334045410156,
      "epoch": 0.046857142857142854,
      "grad_norm": 0.2168555566166619,
      "kl": 3.137439489364624e-05,
      "learning_rate": 8.199999999999999e-07,
      "loss": -0.0012,
      "reward": 0.04771171510219574,
      "reward_std": 0.33250839821994305,
      "rewards/cosine_scaled_reward": -0.06989414617419243,
      "rewards/format_reward": 0.18750000186264515,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1300.6250457763672,
      "epoch": 0.048,
      "grad_norm": 0.40542845209419376,
      "kl": 0.000291675329208374,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.0768,
      "reward": 0.27488730661571026,
      "reward_std": 0.45710677094757557,
      "rewards/cosine_scaled_reward": -0.1646396858850494,
      "rewards/format_reward": 0.6041666716337204,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1705.8750610351562,
      "epoch": 0.04914285714285714,
      "grad_norm": 0.21842925663095267,
      "kl": 3.538280725479126e-05,
      "learning_rate": 8.599999999999999e-07,
      "loss": 0.0308,
      "reward": -0.2755163535475731,
      "reward_std": 0.3637393806129694,
      "rewards/cosine_scaled_reward": -0.2210915139876306,
      "rewards/format_reward": 0.1666666679084301,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1665.0625305175781,
      "epoch": 0.05028571428571429,
      "grad_norm": 0.26271417694787236,
      "kl": 0.00046503543853759766,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.073,
      "reward": -0.12092901021242142,
      "reward_std": 0.5556337833404541,
      "rewards/cosine_scaled_reward": -0.17504783952608705,
      "rewards/format_reward": 0.2291666679084301,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1733.2084045410156,
      "epoch": 0.05142857142857143,
      "grad_norm": 0.21285192669515357,
      "kl": 5.0537288188934326e-05,
      "learning_rate": 9e-07,
      "loss": 0.0423,
      "reward": -0.05799056589603424,
      "reward_std": 0.4342048391699791,
      "rewards/cosine_scaled_reward": -0.14357861876487732,
      "rewards/format_reward": 0.22916666977107525,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1640.0834045410156,
      "epoch": 0.052571428571428575,
      "grad_norm": 0.2622293688477209,
      "kl": 0.00013068318367004395,
      "learning_rate": 9.2e-07,
      "loss": 0.0317,
      "reward": -0.005384169518947601,
      "reward_std": 0.3068407401442528,
      "rewards/cosine_scaled_reward": -0.1068587563931942,
      "rewards/format_reward": 0.20833333395421505,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1498.8333892822266,
      "epoch": 0.053714285714285714,
      "grad_norm": 0.274608905827555,
      "kl": 0.0001885145902633667,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.049,
      "reward": -0.002073638141155243,
      "reward_std": 0.4514222964644432,
      "rewards/cosine_scaled_reward": -0.17812015302479267,
      "rewards/format_reward": 0.3541666753590107,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1610.4792175292969,
      "epoch": 0.054857142857142854,
      "grad_norm": 0.24771930467103717,
      "kl": 0.00015616416931152344,
      "learning_rate": 9.6e-07,
      "loss": 0.0334,
      "reward": -0.22091616783291101,
      "reward_std": 0.33334225323051214,
      "rewards/cosine_scaled_reward": -0.21462474018335342,
      "rewards/format_reward": 0.20833334140479565,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1341.1458740234375,
      "epoch": 0.056,
      "grad_norm": 0.3710205417665813,
      "kl": 0.00029793381690979004,
      "learning_rate": 9.8e-07,
      "loss": 0.0862,
      "reward": 0.40674951672554016,
      "reward_std": 0.5115297809243202,
      "rewards/cosine_scaled_reward": -0.025791920721530914,
      "rewards/format_reward": 0.45833333395421505,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1335.1667175292969,
      "epoch": 0.05714285714285714,
      "grad_norm": 0.3034272517231627,
      "kl": 0.0005925297737121582,
      "learning_rate": 1e-06,
      "loss": 0.1036,
      "reward": 0.36978277564048767,
      "reward_std": 0.4990865057334304,
      "rewards/cosine_scaled_reward": -0.033858626149594784,
      "rewards/format_reward": 0.43750002048909664,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1686.8959045410156,
      "epoch": 0.05828571428571429,
      "grad_norm": 0.3009121706411098,
      "kl": 0.00032591819763183594,
      "learning_rate": 9.999890338174275e-07,
      "loss": 0.0864,
      "reward": -0.20582207757979631,
      "reward_std": 0.5198994930833578,
      "rewards/cosine_scaled_reward": -0.19666103832423687,
      "rewards/format_reward": 0.1875000111758709,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1718.2291870117188,
      "epoch": 0.05942857142857143,
      "grad_norm": 0.21311754620957382,
      "kl": 0.0005127787590026855,
      "learning_rate": 9.999561358041868e-07,
      "loss": 0.0262,
      "reward": -0.39756081253290176,
      "reward_std": 0.34694093093276024,
      "rewards/cosine_scaled_reward": -0.2716970667243004,
      "rewards/format_reward": 0.1458333358168602,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1611.8334045410156,
      "epoch": 0.060571428571428575,
      "grad_norm": 0.22683388578373892,
      "kl": 0.0005531832575798035,
      "learning_rate": 9.999013075636804e-07,
      "loss": 0.068,
      "reward": -0.13391486555337906,
      "reward_std": 0.27848392724990845,
      "rewards/cosine_scaled_reward": -0.22320742718875408,
      "rewards/format_reward": 0.31250000186264515,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1442.0834045410156,
      "epoch": 0.061714285714285715,
      "grad_norm": 0.24769106962876689,
      "kl": 0.0002713203430175781,
      "learning_rate": 9.998245517681593e-07,
      "loss": 0.0911,
      "reward": -0.11875106766819954,
      "reward_std": 0.1542784534394741,
      "rewards/cosine_scaled_reward": -0.2572922073304653,
      "rewards/format_reward": 0.3958333432674408,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1688.4167175292969,
      "epoch": 0.06285714285714286,
      "grad_norm": 0.22851815885942953,
      "kl": 0.0001881718635559082,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0068,
      "reward": -0.3640219047665596,
      "reward_std": 0.2585913948714733,
      "rewards/cosine_scaled_reward": -0.2965943031013012,
      "rewards/format_reward": 0.2291666753590107,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1569.4166870117188,
      "epoch": 0.064,
      "grad_norm": 0.2466081306910316,
      "kl": 0.0021448135375976562,
      "learning_rate": 9.996052735444862e-07,
      "loss": 0.096,
      "reward": -0.4589140391908586,
      "reward_std": 0.4320836700499058,
      "rewards/cosine_scaled_reward": -0.3440403640270233,
      "rewards/format_reward": 0.2291666679084301,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1629.979248046875,
      "epoch": 0.06514285714285714,
      "grad_norm": 0.22573731739546327,
      "kl": 0.0010238885879516602,
      "learning_rate": 9.994627618036452e-07,
      "loss": 0.0592,
      "reward": -0.3061641752719879,
      "reward_std": 0.5002065226435661,
      "rewards/cosine_scaled_reward": -0.26766542345285416,
      "rewards/format_reward": 0.2291666716337204,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1660.4792175292969,
      "epoch": 0.06628571428571428,
      "grad_norm": 0.22190381637143303,
      "kl": 0.0011049509048461914,
      "learning_rate": 9.992983438818915e-07,
      "loss": 0.022,
      "reward": -0.32173825055360794,
      "reward_std": 0.27725364826619625,
      "rewards/cosine_scaled_reward": -0.2754524536430836,
      "rewards/format_reward": 0.2291666679084301,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1690.0417175292969,
      "epoch": 0.06742857142857143,
      "grad_norm": 0.21914617585966853,
      "kl": 0.0010164976119995117,
      "learning_rate": 9.991120277927223e-07,
      "loss": 0.0444,
      "reward": -0.021609768271446228,
      "reward_std": 0.3677750062197447,
      "rewards/cosine_scaled_reward": -0.135804895311594,
      "rewards/format_reward": 0.25000000558793545,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1581.6875305175781,
      "epoch": 0.06857142857142857,
      "grad_norm": 0.4016735260144472,
      "kl": 0.01423954963684082,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0192,
      "reward": 0.11502109467983246,
      "reward_std": 0.29630398005247116,
      "rewards/cosine_scaled_reward": -0.057072801515460014,
      "rewards/format_reward": 0.2291666716337204,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1475.5833740234375,
      "epoch": 0.06971428571428571,
      "grad_norm": 0.24285848407581584,
      "kl": 0.0003628730773925781,
      "learning_rate": 9.98673738502114e-07,
      "loss": 0.0731,
      "reward": 0.5937481597065926,
      "reward_std": 0.6881431620568037,
      "rewards/cosine_scaled_reward": 0.046874068677425385,
      "rewards/format_reward": 0.5000000149011612,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1805.7500610351562,
      "epoch": 0.07085714285714285,
      "grad_norm": 0.19714468440546948,
      "kl": 0.0005519390106201172,
      "learning_rate": 9.98421786662277e-07,
      "loss": 0.0172,
      "reward": -0.4636555463075638,
      "reward_std": 0.3160466430708766,
      "rewards/cosine_scaled_reward": -0.2734944522380829,
      "rewards/format_reward": 0.08333333395421505,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1329.2917175292969,
      "epoch": 0.072,
      "grad_norm": 0.28510447078335305,
      "kl": 0.004929542541503906,
      "learning_rate": 9.981479793771866e-07,
      "loss": 0.1079,
      "reward": 0.30475724674761295,
      "reward_std": 0.4675188772380352,
      "rewards/cosine_scaled_reward": -0.0976213626563549,
      "rewards/format_reward": 0.5000000149011612,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1636.5208740234375,
      "epoch": 0.07314285714285715,
      "grad_norm": 0.20815660806735267,
      "kl": 0.0003604888916015625,
      "learning_rate": 9.97852329991824e-07,
      "loss": 0.0625,
      "reward": 0.29327625688165426,
      "reward_std": 0.5610844530165195,
      "rewards/cosine_scaled_reward": -0.03044520819094032,
      "rewards/format_reward": 0.354166679084301,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1559.5000305175781,
      "epoch": 0.07428571428571429,
      "grad_norm": 0.24172417943995111,
      "kl": 0.001363515853881836,
      "learning_rate": 9.975348529157229e-07,
      "loss": 0.0995,
      "reward": 0.1283707581460476,
      "reward_std": 0.7667413726449013,
      "rewards/cosine_scaled_reward": -0.13373128045350313,
      "rewards/format_reward": 0.3958333507180214,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1729.6667175292969,
      "epoch": 0.07542857142857143,
      "grad_norm": 0.20090852438136195,
      "kl": 0.00067138671875,
      "learning_rate": 9.971955636222684e-07,
      "loss": 0.0209,
      "reward": -0.39017004892230034,
      "reward_std": 0.32542612217366695,
      "rewards/cosine_scaled_reward": -0.3200850263237953,
      "rewards/format_reward": 0.2500000149011612,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1648.7292175292969,
      "epoch": 0.07657142857142857,
      "grad_norm": 0.18795555019652113,
      "kl": 0.0007681846618652344,
      "learning_rate": 9.968344786479415e-07,
      "loss": 0.0342,
      "reward": -0.1792638599872589,
      "reward_std": 0.3578680492937565,
      "rewards/cosine_scaled_reward": -0.20421527326107025,
      "rewards/format_reward": 0.2291666679084301,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1388.5625610351562,
      "epoch": 0.07771428571428571,
      "grad_norm": 0.3904259482407812,
      "kl": 0.00202178955078125,
      "learning_rate": 9.964516155915151e-07,
      "loss": 0.0637,
      "reward": 0.16577239707112312,
      "reward_std": 0.3421984985470772,
      "rewards/cosine_scaled_reward": -0.09419714100658894,
      "rewards/format_reward": 0.3541666716337204,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1507.8333740234375,
      "epoch": 0.07885714285714286,
      "grad_norm": 0.2361059164440503,
      "kl": 0.0008258819580078125,
      "learning_rate": 9.960469931131936e-07,
      "loss": 0.0613,
      "reward": 0.17160904966294765,
      "reward_std": 0.38275655917823315,
      "rewards/cosine_scaled_reward": -0.10169548355042934,
      "rewards/format_reward": 0.37500000558793545,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1690.3750305175781,
      "epoch": 0.08,
      "grad_norm": 0.19302606573391104,
      "kl": 0.002358675003051758,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.105,
      "reward": -0.1555338129401207,
      "reward_std": 0.37855083122849464,
      "rewards/cosine_scaled_reward": -0.20276692137122154,
      "rewards/format_reward": 0.25000000186264515,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1441.729232788086,
      "epoch": 0.08114285714285714,
      "grad_norm": 0.331702227116139,
      "kl": 0.0023870468139648438,
      "learning_rate": 9.951725498333448e-07,
      "loss": 0.1388,
      "reward": -0.2453744667582214,
      "reward_std": 0.15839526243507862,
      "rewards/cosine_scaled_reward": -0.3101872429251671,
      "rewards/format_reward": 0.3750000149011612,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1497.3959045410156,
      "epoch": 0.08228571428571428,
      "grad_norm": 0.33894190686830156,
      "kl": 0.0017808079719543457,
      "learning_rate": 9.947027716509488e-07,
      "loss": 0.0553,
      "reward": 0.09824148565530777,
      "reward_std": 0.1729265321046114,
      "rewards/cosine_scaled_reward": -0.08629592880606651,
      "rewards/format_reward": 0.2708333358168602,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1444.7708892822266,
      "epoch": 0.08342857142857144,
      "grad_norm": 0.9254159035231885,
      "kl": 0.039752960205078125,
      "learning_rate": 9.942113192828444e-07,
      "loss": 0.1025,
      "reward": 0.47389062121510506,
      "reward_std": 0.7162522077560425,
      "rewards/cosine_scaled_reward": -0.05472135776653886,
      "rewards/format_reward": 0.5833333507180214,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1484.7083740234375,
      "epoch": 0.08457142857142858,
      "grad_norm": 0.2164345231616129,
      "kl": 0.0021944046020507812,
      "learning_rate": 9.93698216681727e-07,
      "loss": 0.0129,
      "reward": -0.06718481332063675,
      "reward_std": 0.16878989525139332,
      "rewards/cosine_scaled_reward": -0.22109240666031837,
      "rewards/format_reward": 0.3750000037252903,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1526.0417175292969,
      "epoch": 0.08571428571428572,
      "grad_norm": 0.3075410122107456,
      "kl": 0.00359344482421875,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.0753,
      "reward": 0.17093585059046745,
      "reward_std": 0.4688509330153465,
      "rewards/cosine_scaled_reward": -0.08119874075055122,
      "rewards/format_reward": 0.33333334513008595,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1640.4583740234375,
      "epoch": 0.08685714285714285,
      "grad_norm": 0.20492660661291412,
      "kl": 0.00046312808990478516,
      "learning_rate": 9.926071618660237e-07,
      "loss": 0.0184,
      "reward": 0.029385031666606665,
      "reward_std": 0.6126945875585079,
      "rewards/cosine_scaled_reward": -0.151974156498909,
      "rewards/format_reward": 0.33333333395421505,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1674.5625610351562,
      "epoch": 0.088,
      "grad_norm": 0.21980728108796918,
      "kl": 0.0009822845458984375,
      "learning_rate": 9.9202926282791e-07,
      "loss": -0.0002,
      "reward": -0.18806731700897217,
      "reward_std": 0.12730432488024235,
      "rewards/cosine_scaled_reward": -0.15653366968035698,
      "rewards/format_reward": 0.125,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1518.0625610351562,
      "epoch": 0.08914285714285715,
      "grad_norm": 0.242785552217566,
      "kl": 0.0009822845458984375,
      "learning_rate": 9.91429819907136e-07,
      "loss": 0.0619,
      "reward": 0.13657424598932266,
      "reward_std": 0.4360465779900551,
      "rewards/cosine_scaled_reward": -0.10879619419574738,
      "rewards/format_reward": 0.35416666977107525,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1575.4792175292969,
      "epoch": 0.09028571428571429,
      "grad_norm": 0.24080955526978698,
      "kl": 0.0005426406860351562,
      "learning_rate": 9.908088623197048e-07,
      "loss": 0.0519,
      "reward": 0.016203314065933228,
      "reward_std": 0.6479124575853348,
      "rewards/cosine_scaled_reward": -0.1585650178603828,
      "rewards/format_reward": 0.3333333395421505,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1733.9167175292969,
      "epoch": 0.09142857142857143,
      "grad_norm": 0.2186002750502081,
      "kl": 0.0005044937133789062,
      "learning_rate": 9.901664203302124e-07,
      "loss": 0.031,
      "reward": -0.5251612327992916,
      "reward_std": 0.40141166001558304,
      "rewards/cosine_scaled_reward": -0.33549728989601135,
      "rewards/format_reward": 0.1458333358168602,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1728.479248046875,
      "epoch": 0.09257142857142857,
      "grad_norm": 0.21399417944679958,
      "kl": 0.0009112358093261719,
      "learning_rate": 9.895025252503755e-07,
      "loss": 0.0374,
      "reward": -0.19506264757364988,
      "reward_std": 0.48094464652240276,
      "rewards/cosine_scaled_reward": -0.1912813438102603,
      "rewards/format_reward": 0.18750000558793545,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1601.9792175292969,
      "epoch": 0.09371428571428571,
      "grad_norm": 0.2450961734236274,
      "kl": 0.0009531974792480469,
      "learning_rate": 9.888172094375033e-07,
      "loss": 0.077,
      "reward": -0.1917775571346283,
      "reward_std": 0.5255400985479355,
      "rewards/cosine_scaled_reward": -0.25213877484202385,
      "rewards/format_reward": 0.31250001303851604,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1748.1875610351562,
      "epoch": 0.09485714285714286,
      "grad_norm": 0.22448680749018862,
      "kl": 0.0004420280456542969,
      "learning_rate": 9.881105062929221e-07,
      "loss": 0.0159,
      "reward": -0.43924427404999733,
      "reward_std": 0.2609596960246563,
      "rewards/cosine_scaled_reward": -0.27170546911656857,
      "rewards/format_reward": 0.10416666977107525,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1515.7708435058594,
      "epoch": 0.096,
      "grad_norm": 0.2231038243696207,
      "kl": 0.0006551742553710938,
      "learning_rate": 9.873824502603459e-07,
      "loss": 0.0246,
      "reward": 0.36620646342635155,
      "reward_std": 0.884237602353096,
      "rewards/cosine_scaled_reward": -0.06689677853137255,
      "rewards/format_reward": 0.5000000074505806,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1701.2083740234375,
      "epoch": 0.09714285714285714,
      "grad_norm": 0.20906161676384463,
      "kl": 0.000804901123046875,
      "learning_rate": 9.866330768241983e-07,
      "loss": 0.0555,
      "reward": -0.39954638853669167,
      "reward_std": 0.31576116755604744,
      "rewards/cosine_scaled_reward": -0.2726898640394211,
      "rewards/format_reward": 0.14583333395421505,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1610.9792175292969,
      "epoch": 0.09828571428571428,
      "grad_norm": 0.22100681278056383,
      "kl": 0.0009822845458984375,
      "learning_rate": 9.85862422507884e-07,
      "loss": 0.0444,
      "reward": -0.24343110900372267,
      "reward_std": 0.2885846998542547,
      "rewards/cosine_scaled_reward": -0.30921556800603867,
      "rewards/format_reward": 0.375,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1695.354248046875,
      "epoch": 0.09942857142857142,
      "grad_norm": 0.24683069440334848,
      "kl": 0.0029506683349609375,
      "learning_rate": 9.850705248720068e-07,
      "loss": 0.0377,
      "reward": -0.09222975745797157,
      "reward_std": 0.24668438732624054,
      "rewards/cosine_scaled_reward": -0.1502815391868353,
      "rewards/format_reward": 0.2083333432674408,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1594.9167175292969,
      "epoch": 0.10057142857142858,
      "grad_norm": 0.27215086328931853,
      "kl": 0.0016989707946777344,
      "learning_rate": 9.8425742251254e-07,
      "loss": 0.1075,
      "reward": 0.18186672404408455,
      "reward_std": 0.9013341814279556,
      "rewards/cosine_scaled_reward": -0.07573332265019417,
      "rewards/format_reward": 0.3333333432674408,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1738.7292175292969,
      "epoch": 0.10171428571428572,
      "grad_norm": 0.1946900134085172,
      "kl": 0.000820159912109375,
      "learning_rate": 9.83423155058946e-07,
      "loss": 0.0331,
      "reward": -0.28752805292606354,
      "reward_std": 0.4243736080825329,
      "rewards/cosine_scaled_reward": -0.22709737345576286,
      "rewards/format_reward": 0.16666667722165585,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1572.2916870117188,
      "epoch": 0.10285714285714286,
      "grad_norm": 0.20694868118264276,
      "kl": 0.0007328987121582031,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0753,
      "reward": -0.08595774043351412,
      "reward_std": 0.5348180644214153,
      "rewards/cosine_scaled_reward": -0.18881220323964953,
      "rewards/format_reward": 0.2916666679084301,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1601.5625610351562,
      "epoch": 0.104,
      "grad_norm": 0.20840771038907893,
      "kl": 0.0007948875427246094,
      "learning_rate": 9.816912885430258e-07,
      "loss": 0.0808,
      "reward": -0.015035435557365417,
      "reward_std": 0.14022575318813324,
      "rewards/cosine_scaled_reward": -0.1429343856871128,
      "rewards/format_reward": 0.2708333432674408,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1498.2292175292969,
      "epoch": 0.10514285714285715,
      "grad_norm": 0.20771988001872319,
      "kl": 0.0009174346923828125,
      "learning_rate": 9.807937738894303e-07,
      "loss": 0.0994,
      "reward": 0.07728531863540411,
      "reward_std": 0.508693166077137,
      "rewards/cosine_scaled_reward": -0.1384406816214323,
      "rewards/format_reward": 0.35416666977107525,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1347.8125305175781,
      "epoch": 0.10628571428571429,
      "grad_norm": 0.27527082284418775,
      "kl": 0.0021848678588867188,
      "learning_rate": 9.798752629550546e-07,
      "loss": 0.0296,
      "reward": 0.30088429898023605,
      "reward_std": 0.5643313899636269,
      "rewards/cosine_scaled_reward": -0.10997452400624752,
      "rewards/format_reward": 0.5208333432674408,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1733.2500610351562,
      "epoch": 0.10742857142857143,
      "grad_norm": 0.23935442867120157,
      "kl": 0.0012607574462890625,
      "learning_rate": 9.78935800506826e-07,
      "loss": 0.021,
      "reward": -0.34041892923414707,
      "reward_std": 0.2469240017235279,
      "rewards/cosine_scaled_reward": -0.26395946741104126,
      "rewards/format_reward": 0.18750000186264515,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1738.5625610351562,
      "epoch": 0.10857142857142857,
      "grad_norm": 0.21273217079983556,
      "kl": 0.0006814002990722656,
      "learning_rate": 9.779754323328192e-07,
      "loss": -0.0093,
      "reward": -0.5389137789607048,
      "reward_std": 0.17841140553355217,
      "rewards/cosine_scaled_reward": -0.3423735648393631,
      "rewards/format_reward": 0.14583333395421505,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1433.9375610351562,
      "epoch": 0.10971428571428571,
      "grad_norm": 0.30691056711732384,
      "kl": 0.002574920654296875,
      "learning_rate": 9.769942052400235e-07,
      "loss": 0.137,
      "reward": 0.296867486089468,
      "reward_std": 0.3943296894431114,
      "rewards/cosine_scaled_reward": -0.04948292672634125,
      "rewards/format_reward": 0.3958333432674408,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1567.2500610351562,
      "epoch": 0.11085714285714286,
      "grad_norm": 0.25051085956589897,
      "kl": 0.0013968944549560547,
      "learning_rate": 9.759921670520634e-07,
      "loss": 0.0267,
      "reward": -0.15386457741260529,
      "reward_std": 0.37108149379491806,
      "rewards/cosine_scaled_reward": -0.21234895661473274,
      "rewards/format_reward": 0.2708333432674408,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1406.0208740234375,
      "epoch": 0.112,
      "grad_norm": 0.366560785041491,
      "kl": 0.0012578964233398438,
      "learning_rate": 9.749693666068663e-07,
      "loss": 0.099,
      "reward": 0.3372333124279976,
      "reward_std": 0.3852754198014736,
      "rewards/cosine_scaled_reward": -0.12305000983178616,
      "rewards/format_reward": 0.5833333507180214,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1598.7917175292969,
      "epoch": 0.11314285714285714,
      "grad_norm": 0.2584279138871096,
      "kl": 0.0010881423950195312,
      "learning_rate": 9.739258537542835e-07,
      "loss": 0.0536,
      "reward": 0.1023973822593689,
      "reward_std": 0.4502338841557503,
      "rewards/cosine_scaled_reward": -0.1258846465498209,
      "rewards/format_reward": 0.3541666828095913,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1557.479248046875,
      "epoch": 0.11428571428571428,
      "grad_norm": 0.23713752727518134,
      "kl": 0.0009851455688476562,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0694,
      "reward": -0.15063253417611122,
      "reward_std": 0.3854830376803875,
      "rewards/cosine_scaled_reward": -0.23156626150012016,
      "rewards/format_reward": 0.3125000111758709,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1387.4583435058594,
      "epoch": 0.11542857142857142,
      "grad_norm": 0.32157411791816565,
      "kl": 0.001094818115234375,
      "learning_rate": 9.717768952713511e-07,
      "loss": 0.1116,
      "reward": 0.07011325657367706,
      "reward_std": 0.3243808038532734,
      "rewards/cosine_scaled_reward": -0.19411004893481731,
      "rewards/format_reward": 0.4583333395421505,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1449.3750610351562,
      "epoch": 0.11657142857142858,
      "grad_norm": 0.2168599934302549,
      "kl": 0.0015411376953125,
      "learning_rate": 9.706715543782064e-07,
      "loss": 0.0577,
      "reward": -0.21096567437052727,
      "reward_std": 0.29599858447909355,
      "rewards/cosine_scaled_reward": -0.3138161562383175,
      "rewards/format_reward": 0.4166666865348816,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1715.166748046875,
      "epoch": 0.11771428571428572,
      "grad_norm": 0.21920178674297372,
      "kl": 0.0015869140625,
      "learning_rate": 9.695457105469804e-07,
      "loss": 0.0667,
      "reward": -0.18699942529201508,
      "reward_std": 0.5092732682824135,
      "rewards/cosine_scaled_reward": -0.22891639173030853,
      "rewards/format_reward": 0.2708333395421505,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1304.4583435058594,
      "epoch": 0.11885714285714286,
      "grad_norm": 0.22942484314958453,
      "kl": 0.0013804435729980469,
      "learning_rate": 9.683994186497132e-07,
      "loss": 0.0839,
      "reward": 0.5173723250627518,
      "reward_std": 0.5176322646439075,
      "rewards/cosine_scaled_reward": -0.001730518415570259,
      "rewards/format_reward": 0.5208333358168602,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1364.8333740234375,
      "epoch": 0.12,
      "grad_norm": 0.25403433256650454,
      "kl": 0.0016727447509765625,
      "learning_rate": 9.672327345550543e-07,
      "loss": 0.1156,
      "reward": 0.28816052433103323,
      "reward_std": 0.240465197712183,
      "rewards/cosine_scaled_reward": -0.1267530769109726,
      "rewards/format_reward": 0.541666679084301,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1570.6667175292969,
      "epoch": 0.12114285714285715,
      "grad_norm": 0.2462172191203138,
      "kl": 0.0020122528076171875,
      "learning_rate": 9.66045715125541e-07,
      "loss": 0.0866,
      "reward": 0.34020555624738336,
      "reward_std": 0.7328735627233982,
      "rewards/cosine_scaled_reward": -0.038230573292821646,
      "rewards/format_reward": 0.41666666977107525,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1243.4583740234375,
      "epoch": 0.12228571428571429,
      "grad_norm": 0.22392855280151888,
      "kl": 0.001399993896484375,
      "learning_rate": 9.648384182148252e-07,
      "loss": 0.0861,
      "reward": 0.19801579043269157,
      "reward_std": 0.4772573560476303,
      "rewards/cosine_scaled_reward": -0.18224211037158966,
      "rewards/format_reward": 0.5625000149011612,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1376.5625610351562,
      "epoch": 0.12342857142857143,
      "grad_norm": 0.2328882803373465,
      "kl": 0.0032482147216796875,
      "learning_rate": 9.636109026648554e-07,
      "loss": 0.0636,
      "reward": 0.6495321169495583,
      "reward_std": 0.5899618566036224,
      "rewards/cosine_scaled_reward": 0.06434935945435427,
      "rewards/format_reward": 0.5208333488553762,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1368.0625305175781,
      "epoch": 0.12457142857142857,
      "grad_norm": 0.3696050391986309,
      "kl": 0.0028667449951171875,
      "learning_rate": 9.623632283030077e-07,
      "loss": 0.1246,
      "reward": -0.031360091641545296,
      "reward_std": 0.4002140313386917,
      "rewards/cosine_scaled_reward": -0.2656800393015146,
      "rewards/format_reward": 0.5,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1444.6666870117188,
      "epoch": 0.12571428571428572,
      "grad_norm": 0.35213532577859125,
      "kl": 0.0029430389404296875,
      "learning_rate": 9.610954559391704e-07,
      "loss": 0.1339,
      "reward": 0.6942434869706631,
      "reward_std": 0.9198908805847168,
      "rewards/cosine_scaled_reward": 0.06587174534797668,
      "rewards/format_reward": 0.5625000149011612,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1072.7708587646484,
      "epoch": 0.12685714285714286,
      "grad_norm": 0.2985726423715741,
      "kl": 0.001979827880859375,
      "learning_rate": 9.598076473627796e-07,
      "loss": 0.0476,
      "reward": 0.7408694333862513,
      "reward_std": 0.7333548963069916,
      "rewards/cosine_scaled_reward": -0.004565277136862278,
      "rewards/format_reward": 0.7500000149011612,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1633.4167175292969,
      "epoch": 0.128,
      "grad_norm": 0.22471101395696397,
      "kl": 0.00258636474609375,
      "learning_rate": 9.58499865339809e-07,
      "loss": 0.0346,
      "reward": -0.05079384706914425,
      "reward_std": 0.4366183038800955,
      "rewards/cosine_scaled_reward": -0.2337302602827549,
      "rewards/format_reward": 0.4166666865348816,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1319.7291870117188,
      "epoch": 0.12914285714285714,
      "grad_norm": 0.27063696127291986,
      "kl": 0.0033721923828125,
      "learning_rate": 9.571721736097088e-07,
      "loss": 0.0833,
      "reward": 0.6321319434791803,
      "reward_std": 0.5336715504527092,
      "rewards/cosine_scaled_reward": -0.006850697100162506,
      "rewards/format_reward": 0.6458333507180214,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1052.3958892822266,
      "epoch": 0.13028571428571428,
      "grad_norm": 0.250125198289797,
      "kl": 0.0016460418701171875,
      "learning_rate": 9.55824636882301e-07,
      "loss": 0.0768,
      "reward": 0.653087726328522,
      "reward_std": 0.35864404030144215,
      "rewards/cosine_scaled_reward": -0.017206139862537384,
      "rewards/format_reward": 0.6875000149011612,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1440.2083740234375,
      "epoch": 0.13142857142857142,
      "grad_norm": 0.29266585256345196,
      "kl": 0.0030155181884765625,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.0755,
      "reward": 0.21958831325173378,
      "reward_std": 0.704796127974987,
      "rewards/cosine_scaled_reward": -0.16103917988948524,
      "rewards/format_reward": 0.5416666865348816,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1472.5416870117188,
      "epoch": 0.13257142857142856,
      "grad_norm": 0.26433038131357134,
      "kl": 0.003147125244140625,
      "learning_rate": 9.530702921077358e-07,
      "loss": 0.073,
      "reward": 0.018861573189496994,
      "reward_std": 0.3587416708469391,
      "rewards/cosine_scaled_reward": -0.18848587945103645,
      "rewards/format_reward": 0.39583334140479565,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1545.9375305175781,
      "epoch": 0.1337142857142857,
      "grad_norm": 0.21836493727001577,
      "kl": 0.002864837646484375,
      "learning_rate": 9.516636183034564e-07,
      "loss": 0.1366,
      "reward": -0.32600877061486244,
      "reward_std": 0.43822694569826126,
      "rewards/cosine_scaled_reward": -0.35050439089536667,
      "rewards/format_reward": 0.3750000074505806,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1334.1250305175781,
      "epoch": 0.13485714285714287,
      "grad_norm": 0.24394321780710398,
      "kl": 0.0026493072509765625,
      "learning_rate": 9.502373679810839e-07,
      "loss": 0.035,
      "reward": 0.457018606364727,
      "reward_std": 0.5285698734223843,
      "rewards/cosine_scaled_reward": -0.09440736100077629,
      "rewards/format_reward": 0.645833358168602,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1220.3333740234375,
      "epoch": 0.136,
      "grad_norm": 0.28272459137828676,
      "kl": 0.0042877197265625,
      "learning_rate": 9.487916106540465e-07,
      "loss": 0.0804,
      "reward": 0.3442453145980835,
      "reward_std": 0.564174473285675,
      "rewards/cosine_scaled_reward": -0.12996070086956024,
      "rewards/format_reward": 0.6041666716337204,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1528.6042175292969,
      "epoch": 0.13714285714285715,
      "grad_norm": 0.2668307726658885,
      "kl": 0.00232696533203125,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.1032,
      "reward": -0.03986197151243687,
      "reward_std": 0.37811761628836393,
      "rewards/cosine_scaled_reward": -0.2282643192447722,
      "rewards/format_reward": 0.4166666716337204,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1584.8125610351562,
      "epoch": 0.1382857142857143,
      "grad_norm": 0.22786468100552407,
      "kl": 0.002208709716796875,
      "learning_rate": 9.458418577899774e-07,
      "loss": 0.0046,
      "reward": 0.16309459879994392,
      "reward_std": 0.2453223168849945,
      "rewards/cosine_scaled_reward": -0.1372026912868023,
      "rewards/format_reward": 0.4375000074505806,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1462.6875610351562,
      "epoch": 0.13942857142857143,
      "grad_norm": 0.28816738889821486,
      "kl": 0.00514984130859375,
      "learning_rate": 9.443380060197385e-07,
      "loss": 0.0974,
      "reward": -0.12114270869642496,
      "reward_std": 0.2534109205007553,
      "rewards/cosine_scaled_reward": -0.2689046934247017,
      "rewards/format_reward": 0.41666666977107525,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1375.7708740234375,
      "epoch": 0.14057142857142857,
      "grad_norm": 0.32101258824146217,
      "kl": 0.0041351318359375,
      "learning_rate": 9.428149347714143e-07,
      "loss": 0.1284,
      "reward": 0.18988706171512604,
      "reward_std": 0.8535008877515793,
      "rewards/cosine_scaled_reward": -0.17588980495929718,
      "rewards/format_reward": 0.541666679084301,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1374.7292175292969,
      "epoch": 0.1417142857142857,
      "grad_norm": 0.2425349865258595,
      "kl": 0.00324249267578125,
      "learning_rate": 9.412727182773486e-07,
      "loss": 0.0382,
      "reward": 0.07038946449756622,
      "reward_std": 0.49846766516566277,
      "rewards/cosine_scaled_reward": -0.15230527985841036,
      "rewards/format_reward": 0.375,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1664.8541870117188,
      "epoch": 0.14285714285714285,
      "grad_norm": 0.2457250240943947,
      "kl": 0.0025730133056640625,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.0291,
      "reward": 0.004289238480851054,
      "reward_std": 0.32331261597573757,
      "rewards/cosine_scaled_reward": -0.15410537272691727,
      "rewards/format_reward": 0.31250000186264515,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1422.3750305175781,
      "epoch": 0.144,
      "grad_norm": 0.32285843347583837,
      "kl": 0.005126953125,
      "learning_rate": 9.381311511432658e-07,
      "loss": 0.0961,
      "reward": 0.19516459852457047,
      "reward_std": 0.6147220581769943,
      "rewards/cosine_scaled_reward": -0.162834367249161,
      "rewards/format_reward": 0.5208333507180214,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1370.7708435058594,
      "epoch": 0.14514285714285713,
      "grad_norm": 0.24341515642410516,
      "kl": 0.0030078887939453125,
      "learning_rate": 9.36531953618799e-07,
      "loss": 0.0726,
      "reward": -0.08839717879891396,
      "reward_std": 0.4017263073474169,
      "rewards/cosine_scaled_reward": -0.2941986061632633,
      "rewards/format_reward": 0.5000000149011612,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1219.9167175292969,
      "epoch": 0.1462857142857143,
      "grad_norm": 0.2623858416818109,
      "kl": 0.004070281982421875,
      "learning_rate": 9.34913917072228e-07,
      "loss": 0.0537,
      "reward": 0.43044765666127205,
      "reward_std": 0.49690980464220047,
      "rewards/cosine_scaled_reward": -0.15977618098258972,
      "rewards/format_reward": 0.7500000149011612,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1109.1875305175781,
      "epoch": 0.14742857142857144,
      "grad_norm": 0.2829401049059584,
      "kl": 0.00757598876953125,
      "learning_rate": 9.332771203643714e-07,
      "loss": 0.0692,
      "reward": 0.6423492059111595,
      "reward_std": 0.4438105970621109,
      "rewards/cosine_scaled_reward": -0.03299206681549549,
      "rewards/format_reward": 0.7083333358168602,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1495.8333740234375,
      "epoch": 0.14857142857142858,
      "grad_norm": 0.23104014201895975,
      "kl": 0.00299835205078125,
      "learning_rate": 9.316216432703916e-07,
      "loss": 0.0064,
      "reward": -0.09923176001757383,
      "reward_std": 0.43960002437233925,
      "rewards/cosine_scaled_reward": -0.29961589351296425,
      "rewards/format_reward": 0.5000000149011612,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1543.2500305175781,
      "epoch": 0.14971428571428572,
      "grad_norm": 0.22132261730116032,
      "kl": 0.0033931732177734375,
      "learning_rate": 9.299475664759068e-07,
      "loss": 0.1051,
      "reward": -0.012558471411466599,
      "reward_std": 0.5053001046180725,
      "rewards/cosine_scaled_reward": -0.24586258456110954,
      "rewards/format_reward": 0.47916667722165585,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1477.0625,
      "epoch": 0.15085714285714286,
      "grad_norm": 0.2442588816427236,
      "kl": 0.004528045654296875,
      "learning_rate": 9.282549715730579e-07,
      "loss": 0.0768,
      "reward": -0.11025669425725937,
      "reward_std": 0.18197684548795223,
      "rewards/cosine_scaled_reward": -0.284295029938221,
      "rewards/format_reward": 0.4583333432674408,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1563.3958740234375,
      "epoch": 0.152,
      "grad_norm": 0.21023108591248665,
      "kl": 0.00415802001953125,
      "learning_rate": 9.265439410565328e-07,
      "loss": 0.0672,
      "reward": 0.13176406361162663,
      "reward_std": 0.5022407323122025,
      "rewards/cosine_scaled_reward": -0.18411797285079956,
      "rewards/format_reward": 0.5000000149011612,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1049.1042022705078,
      "epoch": 0.15314285714285714,
      "grad_norm": 0.3838039161390532,
      "kl": 0.00562286376953125,
      "learning_rate": 9.248145583195447e-07,
      "loss": 0.1973,
      "reward": 0.4749515192816034,
      "reward_std": 0.3580738380551338,
      "rewards/cosine_scaled_reward": -0.15835759788751602,
      "rewards/format_reward": 0.7916666865348816,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1351.5416870117188,
      "epoch": 0.15428571428571428,
      "grad_norm": 0.34500799880157473,
      "kl": 0.00400543212890625,
      "learning_rate": 9.230669076497687e-07,
      "loss": 0.143,
      "reward": 0.2647483544424176,
      "reward_std": 0.5427017770707607,
      "rewards/cosine_scaled_reward": -0.11762583442032337,
      "rewards/format_reward": 0.5000000204890966,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1168.0625305175781,
      "epoch": 0.15542857142857142,
      "grad_norm": 0.31218899888892226,
      "kl": 0.004955291748046875,
      "learning_rate": 9.213010742252327e-07,
      "loss": 0.0562,
      "reward": 0.3584494572132826,
      "reward_std": 0.5529016815125942,
      "rewards/cosine_scaled_reward": -0.17494194395840168,
      "rewards/format_reward": 0.7083333432674408,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1282.0416870117188,
      "epoch": 0.15657142857142858,
      "grad_norm": 0.2721613126225875,
      "kl": 0.007869720458984375,
      "learning_rate": 9.195171441101668e-07,
      "loss": 0.1358,
      "reward": 0.2924184873700142,
      "reward_std": 0.5777250528335571,
      "rewards/cosine_scaled_reward": -0.16629073955118656,
      "rewards/format_reward": 0.6250000055879354,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1041.8958740234375,
      "epoch": 0.15771428571428572,
      "grad_norm": 0.30890354701331296,
      "kl": 0.00521087646484375,
      "learning_rate": 9.177152042508077e-07,
      "loss": 0.0338,
      "reward": 0.860385000705719,
      "reward_std": 0.8024220168590546,
      "rewards/cosine_scaled_reward": 0.023942476138472557,
      "rewards/format_reward": 0.8125000149011612,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1192.8125457763672,
      "epoch": 0.15885714285714286,
      "grad_norm": 0.2622844914783918,
      "kl": 0.00412750244140625,
      "learning_rate": 9.158953424711624e-07,
      "loss": 0.0825,
      "reward": 0.5425689108669758,
      "reward_std": 0.5253265127539635,
      "rewards/cosine_scaled_reward": -0.12454888969659805,
      "rewards/format_reward": 0.7916666865348816,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1283.4375305175781,
      "epoch": 0.16,
      "grad_norm": 0.24897560413424463,
      "kl": 0.004375457763671875,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.1075,
      "reward": 0.3927510902285576,
      "reward_std": 0.43108681961894035,
      "rewards/cosine_scaled_reward": -0.1577911265194416,
      "rewards/format_reward": 0.7083333507180214,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1262.7916717529297,
      "epoch": 0.16114285714285714,
      "grad_norm": 0.3772239136734691,
      "kl": 0.005344390869140625,
      "learning_rate": 9.122022088101613e-07,
      "loss": 0.1713,
      "reward": 0.37745123356580734,
      "reward_std": 0.5623941943049431,
      "rewards/cosine_scaled_reward": -0.13419108092784882,
      "rewards/format_reward": 0.6458333432674408,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1246.6042022705078,
      "epoch": 0.16228571428571428,
      "grad_norm": 0.28965855619789826,
      "kl": 0.0045166015625,
      "learning_rate": 9.103291169269299e-07,
      "loss": 0.0725,
      "reward": 0.5083264335989952,
      "reward_std": 0.5853047892451286,
      "rewards/cosine_scaled_reward": -0.047920111566782,
      "rewards/format_reward": 0.6041666828095913,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1448.9375610351562,
      "epoch": 0.16342857142857142,
      "grad_norm": 0.2549900151123108,
      "kl": 0.006290435791015625,
      "learning_rate": 9.084384631108882e-07,
      "loss": 0.1142,
      "reward": 0.13985165720805526,
      "reward_std": 0.2659877985715866,
      "rewards/cosine_scaled_reward": -0.20090750604867935,
      "rewards/format_reward": 0.5416666865348816,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1203.2083740234375,
      "epoch": 0.16457142857142856,
      "grad_norm": 0.2224971436424363,
      "kl": 0.005550384521484375,
      "learning_rate": 9.065303395098358e-07,
      "loss": 0.085,
      "reward": 0.5334329381585121,
      "reward_std": 0.5584629252552986,
      "rewards/cosine_scaled_reward": -0.10828354395925999,
      "rewards/format_reward": 0.75,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1086.9792175292969,
      "epoch": 0.1657142857142857,
      "grad_norm": 0.3813865927902241,
      "kl": 0.0063629150390625,
      "learning_rate": 9.046048391230247e-07,
      "loss": 0.1879,
      "reward": 0.2875216994434595,
      "reward_std": 0.5303685888648033,
      "rewards/cosine_scaled_reward": -0.23123916238546371,
      "rewards/format_reward": 0.7500000149011612,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1285.5625610351562,
      "epoch": 0.16685714285714287,
      "grad_norm": 0.2972743546494519,
      "kl": 0.0059814453125,
      "learning_rate": 9.026620557966279e-07,
      "loss": 0.0594,
      "reward": 0.2565866466611624,
      "reward_std": 0.46598899737000465,
      "rewards/cosine_scaled_reward": -0.25712333619594574,
      "rewards/format_reward": 0.770833358168602,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 896.6250305175781,
      "epoch": 0.168,
      "grad_norm": 0.32647401434979056,
      "kl": 0.006877899169921875,
      "learning_rate": 9.007020842191634e-07,
      "loss": -0.0011,
      "reward": 1.0985181145370007,
      "reward_std": 0.5338096916675568,
      "rewards/cosine_scaled_reward": 0.05967570189386606,
      "rewards/format_reward": 0.9791666716337204,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1160.9583435058594,
      "epoch": 0.16914285714285715,
      "grad_norm": 0.2816274273158885,
      "kl": 0.00585174560546875,
      "learning_rate": 8.987250199168808e-07,
      "loss": 0.0442,
      "reward": 0.18387611024081707,
      "reward_std": 0.2959946282207966,
      "rewards/cosine_scaled_reward": -0.3143119588494301,
      "rewards/format_reward": 0.8125000149011612,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1223.9791870117188,
      "epoch": 0.1702857142857143,
      "grad_norm": 0.2823488259457116,
      "kl": 0.00612640380859375,
      "learning_rate": 8.967309592491052e-07,
      "loss": 0.0654,
      "reward": 0.47756416723132133,
      "reward_std": 0.7413289695978165,
      "rewards/cosine_scaled_reward": -0.11538459919393063,
      "rewards/format_reward": 0.7083333432674408,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 986.6042022705078,
      "epoch": 0.17142857142857143,
      "grad_norm": 0.318064277562745,
      "kl": 0.0080413818359375,
      "learning_rate": 8.9471999940354e-07,
      "loss": 0.1332,
      "reward": 0.401881605386734,
      "reward_std": 0.6674076840281487,
      "rewards/cosine_scaled_reward": -0.20530920289456844,
      "rewards/format_reward": 0.8125000298023224,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1111.4375610351562,
      "epoch": 0.17257142857142857,
      "grad_norm": 0.23750964874516883,
      "kl": 0.005405426025390625,
      "learning_rate": 8.926922383915315e-07,
      "loss": 0.0547,
      "reward": 0.42157261446118355,
      "reward_std": 0.2637167125940323,
      "rewards/cosine_scaled_reward": -0.14338038116693497,
      "rewards/format_reward": 0.7083333432674408,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1390.2292175292969,
      "epoch": 0.1737142857142857,
      "grad_norm": 0.3108688575018839,
      "kl": 0.008697509765625,
      "learning_rate": 8.906477750432903e-07,
      "loss": 0.1077,
      "reward": 0.11867762915790081,
      "reward_std": 0.5801703371107578,
      "rewards/cosine_scaled_reward": -0.2739945203065872,
      "rewards/format_reward": 0.6666666716337204,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1189.4792022705078,
      "epoch": 0.17485714285714285,
      "grad_norm": 0.22859697466435477,
      "kl": 0.006011962890625,
      "learning_rate": 8.88586709003076e-07,
      "loss": 0.0402,
      "reward": 0.46854234486818314,
      "reward_std": 0.5257667489349842,
      "rewards/cosine_scaled_reward": -0.07822884852066636,
      "rewards/format_reward": 0.6250000149011612,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1227.7292175292969,
      "epoch": 0.176,
      "grad_norm": 0.23458511838935112,
      "kl": 0.0063323974609375,
      "learning_rate": 8.865091407243394e-07,
      "loss": 0.129,
      "reward": 0.7308447554241866,
      "reward_std": 0.4724605940282345,
      "rewards/cosine_scaled_reward": 0.011255700141191483,
      "rewards/format_reward": 0.7083333432674408,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1320.6666870117188,
      "epoch": 0.17714285714285713,
      "grad_norm": 0.29059316598505575,
      "kl": 0.007198333740234375,
      "learning_rate": 8.844151714648274e-07,
      "loss": 0.1327,
      "reward": -0.1417454145848751,
      "reward_std": 0.3702365458011627,
      "rewards/cosine_scaled_reward": -0.3521227166056633,
      "rewards/format_reward": 0.5625000055879354,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1116.000015258789,
      "epoch": 0.1782857142857143,
      "grad_norm": 0.37926874198201693,
      "kl": 0.00821685791015625,
      "learning_rate": 8.823049032816478e-07,
      "loss": 0.2189,
      "reward": 0.15536441165022552,
      "reward_std": 0.2769140414893627,
      "rewards/cosine_scaled_reward": -0.2869011387228966,
      "rewards/format_reward": 0.729166679084301,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1049.5000305175781,
      "epoch": 0.17942857142857144,
      "grad_norm": 0.3728612044799926,
      "kl": 0.02156829833984375,
      "learning_rate": 8.801784390262943e-07,
      "loss": 0.0389,
      "reward": 0.7612650550436229,
      "reward_std": 0.31401624344289303,
      "rewards/cosine_scaled_reward": 0.005632489919662476,
      "rewards/format_reward": 0.75,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1386.0833740234375,
      "epoch": 0.18057142857142858,
      "grad_norm": 0.3889317879381384,
      "kl": 0.00850677490234375,
      "learning_rate": 8.780358823396352e-07,
      "loss": 0.1308,
      "reward": 0.06261628679931164,
      "reward_std": 0.3530626520514488,
      "rewards/cosine_scaled_reward": -0.3124418593943119,
      "rewards/format_reward": 0.6875000298023224,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1129.7708740234375,
      "epoch": 0.18171428571428572,
      "grad_norm": 0.3220977926530576,
      "kl": 0.00748443603515625,
      "learning_rate": 8.758773376468604e-07,
      "loss": 0.1262,
      "reward": 0.6195714063942432,
      "reward_std": 0.6696993261575699,
      "rewards/cosine_scaled_reward": -0.08604763355106115,
      "rewards/format_reward": 0.7916666865348816,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1160.4167175292969,
      "epoch": 0.18285714285714286,
      "grad_norm": 0.2845576646765754,
      "kl": 0.0069122314453125,
      "learning_rate": 8.737029101523929e-07,
      "loss": 0.1064,
      "reward": 0.6454856535419822,
      "reward_std": 0.8377318382263184,
      "rewards/cosine_scaled_reward": -0.08350718393921852,
      "rewards/format_reward": 0.8125000149011612,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1429.4792175292969,
      "epoch": 0.184,
      "grad_norm": 0.25219633802451885,
      "kl": 0.00858306884765625,
      "learning_rate": 8.715127058347614e-07,
      "loss": 0.0965,
      "reward": 0.009432412683963776,
      "reward_std": 0.3042390923947096,
      "rewards/cosine_scaled_reward": -0.27653381787240505,
      "rewards/format_reward": 0.5625000149011612,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1226.7917175292969,
      "epoch": 0.18514285714285714,
      "grad_norm": 0.26347106975524837,
      "kl": 0.0080108642578125,
      "learning_rate": 8.693068314414344e-07,
      "loss": 0.077,
      "reward": 0.24512136541306973,
      "reward_std": 0.43705228716135025,
      "rewards/cosine_scaled_reward": -0.29410600662231445,
      "rewards/format_reward": 0.8333333432674408,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 926.1250305175781,
      "epoch": 0.18628571428571428,
      "grad_norm": 0.41739039022115654,
      "kl": 0.0112457275390625,
      "learning_rate": 8.670853944836176e-07,
      "loss": 0.1827,
      "reward": 0.7628292813897133,
      "reward_std": 0.8151352852582932,
      "rewards/cosine_scaled_reward": -0.04566871002316475,
      "rewards/format_reward": 0.8541666716337204,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 946.125,
      "epoch": 0.18742857142857142,
      "grad_norm": 0.36967841595429546,
      "kl": 0.01031494140625,
      "learning_rate": 8.648485032310144e-07,
      "loss": 0.1834,
      "reward": 0.6057916302233934,
      "reward_std": 0.48515384271740913,
      "rewards/cosine_scaled_reward": -0.10335419327020645,
      "rewards/format_reward": 0.8125000298023224,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1174.4792175292969,
      "epoch": 0.18857142857142858,
      "grad_norm": 0.2708332867444507,
      "kl": 0.00788116455078125,
      "learning_rate": 8.625962667065487e-07,
      "loss": 0.0394,
      "reward": 0.32264771312475204,
      "reward_std": 0.4833778813481331,
      "rewards/cosine_scaled_reward": -0.24492615275084972,
      "rewards/format_reward": 0.8125000149011612,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 907.0833740234375,
      "epoch": 0.18971428571428572,
      "grad_norm": 0.3261002575687217,
      "kl": 0.00717926025390625,
      "learning_rate": 8.603287946810513e-07,
      "loss": 0.1283,
      "reward": 0.6173169314861298,
      "reward_std": 0.2740478292107582,
      "rewards/cosine_scaled_reward": -0.1705082282423973,
      "rewards/format_reward": 0.9583333432674408,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1270.0417175292969,
      "epoch": 0.19085714285714286,
      "grad_norm": 0.3129560409827591,
      "kl": 0.00971221923828125,
      "learning_rate": 8.580461976679099e-07,
      "loss": 0.1093,
      "reward": 0.3518100567162037,
      "reward_std": 0.5595069229602814,
      "rewards/cosine_scaled_reward": -0.1470116525888443,
      "rewards/format_reward": 0.645833358168602,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1077.1041870117188,
      "epoch": 0.192,
      "grad_norm": 0.26915582005747507,
      "kl": 0.0078125,
      "learning_rate": 8.557485869176825e-07,
      "loss": 0.1617,
      "reward": 0.2642595246434212,
      "reward_std": 0.46994560211896896,
      "rewards/cosine_scaled_reward": -0.26370356790721416,
      "rewards/format_reward": 0.7916666865348816,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1156.2500305175781,
      "epoch": 0.19314285714285714,
      "grad_norm": 0.35785552736378773,
      "kl": 0.0098724365234375,
      "learning_rate": 8.534360744126753e-07,
      "loss": 0.0922,
      "reward": 0.77548947930336,
      "reward_std": 0.7726699560880661,
      "rewards/cosine_scaled_reward": 0.0023280568420886993,
      "rewards/format_reward": 0.770833358168602,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1073.7292175292969,
      "epoch": 0.19428571428571428,
      "grad_norm": 0.32755955253118335,
      "kl": 0.0117034912109375,
      "learning_rate": 8.511087728614862e-07,
      "loss": 0.0752,
      "reward": 0.19202834740281105,
      "reward_std": 0.3850276917219162,
      "rewards/cosine_scaled_reward": -0.3206525072455406,
      "rewards/format_reward": 0.8333333432674408,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 918.8333511352539,
      "epoch": 0.19542857142857142,
      "grad_norm": 0.3616914993674,
      "kl": 0.00833892822265625,
      "learning_rate": 8.487667956935087e-07,
      "loss": 0.0904,
      "reward": 0.5478162653744221,
      "reward_std": 0.6629246398806572,
      "rewards/cosine_scaled_reward": -0.1948418878018856,
      "rewards/format_reward": 0.9375,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1036.1458740234375,
      "epoch": 0.19657142857142856,
      "grad_norm": 0.3354400822116869,
      "kl": 0.0130157470703125,
      "learning_rate": 8.464102570534061e-07,
      "loss": 0.0669,
      "reward": 0.7608658275566995,
      "reward_std": 0.6014236621558666,
      "rewards/cosine_scaled_reward": -0.04665040969848633,
      "rewards/format_reward": 0.8541666865348816,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1106.4583740234375,
      "epoch": 0.1977142857142857,
      "grad_norm": 0.3236947350770136,
      "kl": 0.0121307373046875,
      "learning_rate": 8.440392717955475e-07,
      "loss": 0.093,
      "reward": 0.7088564559817314,
      "reward_std": 0.4235651511698961,
      "rewards/cosine_scaled_reward": -0.010155089199543,
      "rewards/format_reward": 0.7291666865348816,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1109.5000305175781,
      "epoch": 0.19885714285714284,
      "grad_norm": 0.37244639543702895,
      "kl": 0.015838623046875,
      "learning_rate": 8.416539554784089e-07,
      "loss": 0.1098,
      "reward": 0.17886048182845116,
      "reward_std": 0.35543810576200485,
      "rewards/cosine_scaled_reward": -0.30640310421586037,
      "rewards/format_reward": 0.7916666865348816,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 972.9167022705078,
      "epoch": 0.2,
      "grad_norm": 0.6554774460546362,
      "kl": 0.0153045654296875,
      "learning_rate": 8.392544243589427e-07,
      "loss": 0.2068,
      "reward": 0.607050247490406,
      "reward_std": 0.4396999180316925,
      "rewards/cosine_scaled_reward": -0.14439154416322708,
      "rewards/format_reward": 0.895833358168602,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 998.2291870117188,
      "epoch": 0.20114285714285715,
      "grad_norm": 0.28748166515655293,
      "kl": 0.0133819580078125,
      "learning_rate": 8.368407953869103e-07,
      "loss": 0.0371,
      "reward": 0.486224377527833,
      "reward_std": 0.6124172061681747,
      "rewards/cosine_scaled_reward": -0.17355448007583618,
      "rewards/format_reward": 0.8333333432674408,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 916.2291870117188,
      "epoch": 0.2022857142857143,
      "grad_norm": 0.4438177799902679,
      "kl": 0.0131378173828125,
      "learning_rate": 8.344131861991828e-07,
      "loss": 0.1487,
      "reward": 0.8074519336223602,
      "reward_std": 0.4988584369421005,
      "rewards/cosine_scaled_reward": -0.05460738018155098,
      "rewards/format_reward": 0.9166666865348816,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 822.8333740234375,
      "epoch": 0.20342857142857143,
      "grad_norm": 0.5173286289503403,
      "kl": 0.0179443359375,
      "learning_rate": 8.319717151140072e-07,
      "loss": 0.1961,
      "reward": 1.0362385213375092,
      "reward_std": 0.5397170335054398,
      "rewards/cosine_scaled_reward": 0.13270257785916328,
      "rewards/format_reward": 0.770833358168602,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 959.9792022705078,
      "epoch": 0.20457142857142857,
      "grad_norm": 0.369107073779179,
      "kl": 0.016815185546875,
      "learning_rate": 8.295165011252396e-07,
      "loss": 0.1417,
      "reward": 0.6556574255228043,
      "reward_std": 0.4815560430288315,
      "rewards/cosine_scaled_reward": -0.10967130470089614,
      "rewards/format_reward": 0.8750000149011612,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1162.7292022705078,
      "epoch": 0.2057142857142857,
      "grad_norm": 0.5036563993456736,
      "kl": 0.01904296875,
      "learning_rate": 8.270476638965461e-07,
      "loss": 0.0949,
      "reward": 0.2779462654143572,
      "reward_std": 0.4615231901407242,
      "rewards/cosine_scaled_reward": -0.24644354078918695,
      "rewards/format_reward": 0.7708333432674408,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1078.8333435058594,
      "epoch": 0.20685714285714285,
      "grad_norm": 0.4317948665990577,
      "kl": 0.0135955810546875,
      "learning_rate": 8.245653237555705e-07,
      "loss": 0.1473,
      "reward": 0.6264736168086529,
      "reward_std": 0.5298948585987091,
      "rewards/cosine_scaled_reward": -0.10342983156442642,
      "rewards/format_reward": 0.8333333432674408,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1065.8125305175781,
      "epoch": 0.208,
      "grad_norm": 0.5168299485262725,
      "kl": 0.02105712890625,
      "learning_rate": 8.220696016880687e-07,
      "loss": 0.1884,
      "reward": 0.3882112614810467,
      "reward_std": 0.5859006345272064,
      "rewards/cosine_scaled_reward": -0.2017277143895626,
      "rewards/format_reward": 0.7916666865348816,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1069.4583740234375,
      "epoch": 0.20914285714285713,
      "grad_norm": 0.5024855038579699,
      "kl": 0.0205078125,
      "learning_rate": 8.195606193320136e-07,
      "loss": 0.1323,
      "reward": 0.24412129819393158,
      "reward_std": 0.47408775985240936,
      "rewards/cosine_scaled_reward": -0.2529393620789051,
      "rewards/format_reward": 0.7500000149011612,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 936.4792022705078,
      "epoch": 0.2102857142857143,
      "grad_norm": 0.4981488833418968,
      "kl": 0.017730712890625,
      "learning_rate": 8.170384989716657e-07,
      "loss": 0.137,
      "reward": 0.6930912919342518,
      "reward_std": 0.5617035925388336,
      "rewards/cosine_scaled_reward": -0.03887102263979614,
      "rewards/format_reward": 0.7708333432674408,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1030.4166717529297,
      "epoch": 0.21142857142857144,
      "grad_norm": 0.4904295101939947,
      "kl": 0.0301513671875,
      "learning_rate": 8.145033635316128e-07,
      "loss": 0.1667,
      "reward": 0.07037857547402382,
      "reward_std": 0.27715054154396057,
      "rewards/cosine_scaled_reward": -0.33981072157621384,
      "rewards/format_reward": 0.7500000298023224,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1297.3750457763672,
      "epoch": 0.21257142857142858,
      "grad_norm": 0.359329704495533,
      "kl": 0.02447509765625,
      "learning_rate": 8.119553365707802e-07,
      "loss": 0.0722,
      "reward": 0.27740756422281265,
      "reward_std": 0.35020239651203156,
      "rewards/cosine_scaled_reward": -0.20504622161388397,
      "rewards/format_reward": 0.6875000149011612,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1278.2291870117188,
      "epoch": 0.21371428571428572,
      "grad_norm": 0.6229091446373484,
      "kl": 0.037841796875,
      "learning_rate": 8.093945422764069e-07,
      "loss": 0.159,
      "reward": 0.6862413678318262,
      "reward_std": 0.806188240647316,
      "rewards/cosine_scaled_reward": -0.011045984923839569,
      "rewards/format_reward": 0.7083333507180214,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 828.7083587646484,
      "epoch": 0.21485714285714286,
      "grad_norm": 0.8396211982213951,
      "kl": 0.029296875,
      "learning_rate": 8.068211054579943e-07,
      "loss": 0.1705,
      "reward": 0.5941705331206322,
      "reward_std": 0.6708386167883873,
      "rewards/cosine_scaled_reward": -0.12999806739389896,
      "rewards/format_reward": 0.8541666865348816,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1126.9583740234375,
      "epoch": 0.216,
      "grad_norm": 1.0692586435721545,
      "kl": 0.05059814453125,
      "learning_rate": 8.04235151541222e-07,
      "loss": 0.2306,
      "reward": 0.3716874085366726,
      "reward_std": 0.6852569133043289,
      "rewards/cosine_scaled_reward": -0.17873962549492717,
      "rewards/format_reward": 0.7291666939854622,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1441.7292175292969,
      "epoch": 0.21714285714285714,
      "grad_norm": 0.4556901372243305,
      "kl": 0.0775146484375,
      "learning_rate": 8.01636806561836e-07,
      "loss": 0.0641,
      "reward": -0.02832420915365219,
      "reward_std": 0.41898399591445923,
      "rewards/cosine_scaled_reward": -0.21207877062261105,
      "rewards/format_reward": 0.3958333432674408,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1079.3541870117188,
      "epoch": 0.21828571428571428,
      "grad_norm": 0.7752155732582218,
      "kl": 0.05340576171875,
      "learning_rate": 7.990261971595048e-07,
      "loss": 0.1862,
      "reward": 0.4970630258321762,
      "reward_std": 0.6355597376823425,
      "rewards/cosine_scaled_reward": -0.1264684833586216,
      "rewards/format_reward": 0.7500000149011612,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1067.5417175292969,
      "epoch": 0.21942857142857142,
      "grad_norm": 0.9433921479755671,
      "kl": 0.0628662109375,
      "learning_rate": 7.964034505716476e-07,
      "loss": 0.1345,
      "reward": 0.34896004013717175,
      "reward_std": 0.44530968368053436,
      "rewards/cosine_scaled_reward": -0.19010332133620977,
      "rewards/format_reward": 0.7291666865348816,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1153.1458892822266,
      "epoch": 0.22057142857142858,
      "grad_norm": 0.557299045473737,
      "kl": 0.07440185546875,
      "learning_rate": 7.93768694627233e-07,
      "loss": 0.0623,
      "reward": 0.3937496952712536,
      "reward_std": 0.4528709352016449,
      "rewards/cosine_scaled_reward": -0.14687515422701836,
      "rewards/format_reward": 0.6875000149011612,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 772.7708435058594,
      "epoch": 0.22171428571428572,
      "grad_norm": 1.0195572615380695,
      "kl": 0.03753662109375,
      "learning_rate": 7.911220577405484e-07,
      "loss": 0.1994,
      "reward": 1.379511073231697,
      "reward_std": 0.604660227894783,
      "rewards/cosine_scaled_reward": 0.23142218962311745,
      "rewards/format_reward": 0.9166666865348816,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1043.2708740234375,
      "epoch": 0.22285714285714286,
      "grad_norm": 0.9603645520119819,
      "kl": 0.057830810546875,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.101,
      "reward": 0.9527463093400002,
      "reward_std": 0.651703879237175,
      "rewards/cosine_scaled_reward": 0.12220647558569908,
      "rewards/format_reward": 0.7083333432674408,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1171.8125305175781,
      "epoch": 0.224,
      "grad_norm": 1.0759043540199384,
      "kl": 0.094970703125,
      "learning_rate": 7.857936576865356e-07,
      "loss": 0.0986,
      "reward": 0.22757766395807266,
      "reward_std": 0.5421559736132622,
      "rewards/cosine_scaled_reward": -0.14662783965468407,
      "rewards/format_reward": 0.5208333488553762,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1254.6458892822266,
      "epoch": 0.22514285714285714,
      "grad_norm": 1.2281398548522355,
      "kl": 0.1163330078125,
      "learning_rate": 7.831121542179086e-07,
      "loss": 0.2334,
      "reward": 0.1120694987475872,
      "reward_std": 0.406834427267313,
      "rewards/cosine_scaled_reward": -0.21479860320687294,
      "rewards/format_reward": 0.541666679084301,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1551.9792175292969,
      "epoch": 0.22628571428571428,
      "grad_norm": 1.2807709220712407,
      "kl": 0.1573486328125,
      "learning_rate": 7.804192891917571e-07,
      "loss": 0.1642,
      "reward": 0.1520095318555832,
      "reward_std": 0.5469059012830257,
      "rewards/cosine_scaled_reward": -0.16357857827097178,
      "rewards/format_reward": 0.4791666716337204,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1243.7500457763672,
      "epoch": 0.22742857142857142,
      "grad_norm": 1.2387930807523095,
      "kl": 0.1546630859375,
      "learning_rate": 7.777151938545235e-07,
      "loss": 0.0664,
      "reward": 0.5908387266099453,
      "reward_std": 0.44286736100912094,
      "rewards/cosine_scaled_reward": 0.014169345609843731,
      "rewards/format_reward": 0.5625000149011612,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 998.3542022705078,
      "epoch": 0.22857142857142856,
      "grad_norm": 1.6258231243608119,
      "kl": 0.146240234375,
      "learning_rate": 7.75e-07,
      "loss": 0.223,
      "reward": 0.9689896870404482,
      "reward_std": 0.6490836925804615,
      "rewards/cosine_scaled_reward": 0.10949480719864368,
      "rewards/format_reward": 0.7500000298023224,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1197.5208587646484,
      "epoch": 0.2297142857142857,
      "grad_norm": 1.2117522808382983,
      "kl": 0.15203857421875,
      "learning_rate": 7.72273839962904e-07,
      "loss": 0.1108,
      "reward": 0.29535099118947983,
      "reward_std": 0.6659888252615929,
      "rewards/cosine_scaled_reward": -0.18565785279497504,
      "rewards/format_reward": 0.6666666865348816,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1035.2292022705078,
      "epoch": 0.23085714285714284,
      "grad_norm": 2.430024645446878,
      "kl": 0.1729736328125,
      "learning_rate": 7.695368466124296e-07,
      "loss": 0.1341,
      "reward": 0.4362456016242504,
      "reward_std": 0.665816992521286,
      "rewards/cosine_scaled_reward": -0.13604386523365974,
      "rewards/format_reward": 0.7083333432674408,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1285.2083587646484,
      "epoch": 0.232,
      "grad_norm": 3.5252314114631926,
      "kl": 0.2603759765625,
      "learning_rate": 7.667891533457718e-07,
      "loss": 0.2005,
      "reward": 0.48519248701632023,
      "reward_std": 0.612464651465416,
      "rewards/cosine_scaled_reward": -0.08032042533159256,
      "rewards/format_reward": 0.6458333432674408,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 835.0833587646484,
      "epoch": 0.23314285714285715,
      "grad_norm": 1.8381824332135883,
      "kl": 0.1859130859375,
      "learning_rate": 7.640308940816239e-07,
      "loss": 0.053,
      "reward": 1.2399137616157532,
      "reward_std": 0.6745168194174767,
      "rewards/cosine_scaled_reward": 0.2241235449910164,
      "rewards/format_reward": 0.7916666865348816,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1421.9583435058594,
      "epoch": 0.2342857142857143,
      "grad_norm": 1.7567396533005133,
      "kl": 0.362548828125,
      "learning_rate": 7.612622032536507e-07,
      "loss": 0.1051,
      "reward": 0.3085259608924389,
      "reward_std": 0.6349210105836391,
      "rewards/cosine_scaled_reward": -0.08532036282122135,
      "rewards/format_reward": 0.4791666865348816,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1228.7083892822266,
      "epoch": 0.23542857142857143,
      "grad_norm": 2.3389392066981562,
      "kl": 0.30615234375,
      "learning_rate": 7.584832158039378e-07,
      "loss": 0.0693,
      "reward": 0.18148453161120415,
      "reward_std": 0.5284193530678749,
      "rewards/cosine_scaled_reward": -0.24259107932448387,
      "rewards/format_reward": 0.6666666716337204,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1070.4792022705078,
      "epoch": 0.23657142857142857,
      "grad_norm": 3.543320557594463,
      "kl": 0.26416015625,
      "learning_rate": 7.556940671764124e-07,
      "loss": 0.1883,
      "reward": 0.542645301669836,
      "reward_std": 0.5379708558320999,
      "rewards/cosine_scaled_reward": -0.12451068125665188,
      "rewards/format_reward": 0.7916666716337204,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1317.2708740234375,
      "epoch": 0.2377142857142857,
      "grad_norm": 1.9754558032385148,
      "kl": 0.6748046875,
      "learning_rate": 7.528948933102438e-07,
      "loss": 0.1365,
      "reward": 0.09549727046396583,
      "reward_std": 0.3623932749032974,
      "rewards/cosine_scaled_reward": -0.2126680426299572,
      "rewards/format_reward": 0.5208333395421505,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 919.9375457763672,
      "epoch": 0.23885714285714285,
      "grad_norm": 3.305425458945869,
      "kl": 0.474609375,
      "learning_rate": 7.500858306332172e-07,
      "loss": 0.0593,
      "reward": 0.7489641904830933,
      "reward_std": 0.4507276937365532,
      "rewards/cosine_scaled_reward": 0.030732073821127415,
      "rewards/format_reward": 0.6875000149011612,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 908.8958435058594,
      "epoch": 0.24,
      "grad_norm": 3.7173678494051496,
      "kl": 0.403564453125,
      "learning_rate": 7.472670160550848e-07,
      "loss": 0.1606,
      "reward": 0.7559212893247604,
      "reward_std": 0.5382421165704727,
      "rewards/cosine_scaled_reward": -0.007456040009856224,
      "rewards/format_reward": 0.7708333432674408,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1479.0416870117188,
      "epoch": 0.24114285714285713,
      "grad_norm": 39.96198653082631,
      "kl": 2.5693359375,
      "learning_rate": 7.444385869608921e-07,
      "loss": 0.2707,
      "reward": 0.03475058265030384,
      "reward_std": 0.3246455695480108,
      "rewards/cosine_scaled_reward": -0.18054138123989105,
      "rewards/format_reward": 0.39583333395421505,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1051.0208587646484,
      "epoch": 0.2422857142857143,
      "grad_norm": 3.2393755765757777,
      "kl": 0.53466796875,
      "learning_rate": 7.416006812042827e-07,
      "loss": 0.0958,
      "reward": 0.6123923324048519,
      "reward_std": 0.5387515500187874,
      "rewards/cosine_scaled_reward": -0.04797050543129444,
      "rewards/format_reward": 0.708333358168602,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1301.3750610351562,
      "epoch": 0.24342857142857144,
      "grad_norm": 2.65733014184082,
      "kl": 0.755859375,
      "learning_rate": 7.387534371007797e-07,
      "loss": 0.1374,
      "reward": 0.1711240354925394,
      "reward_std": 0.42111407220363617,
      "rewards/cosine_scaled_reward": -0.16443797945976257,
      "rewards/format_reward": 0.5000000149011612,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1203.6875610351562,
      "epoch": 0.24457142857142858,
      "grad_norm": 2.501952170306742,
      "kl": 0.50732421875,
      "learning_rate": 7.358969934210438e-07,
      "loss": 0.1105,
      "reward": 0.22278533224016428,
      "reward_std": 0.434869222342968,
      "rewards/cosine_scaled_reward": -0.22194067016243935,
      "rewards/format_reward": 0.6666667014360428,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1249.6667022705078,
      "epoch": 0.24571428571428572,
      "grad_norm": 4.086485386572322,
      "kl": 0.880859375,
      "learning_rate": 7.330314893841101e-07,
      "loss": 0.0173,
      "reward": 0.3316160347312689,
      "reward_std": 0.5279753059148788,
      "rewards/cosine_scaled_reward": -0.14669198356568813,
      "rewards/format_reward": 0.6250000223517418,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1369.1666870117188,
      "epoch": 0.24685714285714286,
      "grad_norm": 3.328918162087878,
      "kl": 0.773193359375,
      "learning_rate": 7.301570646506027e-07,
      "loss": 0.1402,
      "reward": 0.2145287273451686,
      "reward_std": 0.5796768814325333,
      "rewards/cosine_scaled_reward": -0.16356897167861462,
      "rewards/format_reward": 0.5416666865348816,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1269.8542175292969,
      "epoch": 0.248,
      "grad_norm": 2.8333189883709515,
      "kl": 0.75927734375,
      "learning_rate": 7.27273859315928e-07,
      "loss": -0.0115,
      "reward": 0.5310591869056225,
      "reward_std": 0.4825605973601341,
      "rewards/cosine_scaled_reward": -0.057387083768844604,
      "rewards/format_reward": 0.645833358168602,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1252.2708740234375,
      "epoch": 0.24914285714285714,
      "grad_norm": 4.762778702423241,
      "kl": 0.74072265625,
      "learning_rate": 7.243820139034464e-07,
      "loss": 0.1477,
      "reward": 0.5015929639339447,
      "reward_std": 0.3994259871542454,
      "rewards/cosine_scaled_reward": -0.07212021434679627,
      "rewards/format_reward": 0.6458333507180214,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1017.5416870117188,
      "epoch": 0.2502857142857143,
      "grad_norm": 4.164501369060878,
      "kl": 1.07958984375,
      "learning_rate": 7.214816693576234e-07,
      "loss": 0.1337,
      "reward": 0.767455330118537,
      "reward_std": 0.5030167028307915,
      "rewards/cosine_scaled_reward": 0.07122766599059105,
      "rewards/format_reward": 0.6250000223517418,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1053.1875457763672,
      "epoch": 0.25142857142857145,
      "grad_norm": 3.588996799420188,
      "kl": 0.71142578125,
      "learning_rate": 7.185729670371604e-07,
      "loss": 0.1866,
      "reward": 0.48609594255685806,
      "reward_std": 0.617650680243969,
      "rewards/cosine_scaled_reward": -0.11111870361492038,
      "rewards/format_reward": 0.7083333432674408,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1244.687515258789,
      "epoch": 0.25257142857142856,
      "grad_norm": 2.946733537468475,
      "kl": 1.330078125,
      "learning_rate": 7.156560487081051e-07,
      "loss": 0.1268,
      "reward": 0.4570632018148899,
      "reward_std": 0.36856189370155334,
      "rewards/cosine_scaled_reward": -0.0006350576877593994,
      "rewards/format_reward": 0.4583333395421505,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1376.9167175292969,
      "epoch": 0.2537142857142857,
      "grad_norm": 3.53418042013775,
      "kl": 1.1337890625,
      "learning_rate": 7.127310565369415e-07,
      "loss": 0.2362,
      "reward": 0.1362705221399665,
      "reward_std": 0.3934030085802078,
      "rewards/cosine_scaled_reward": -0.19228141009807587,
      "rewards/format_reward": 0.5208333358168602,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1156.750015258789,
      "epoch": 0.25485714285714284,
      "grad_norm": 35.23833796360462,
      "kl": 2.369140625,
      "learning_rate": 7.097981330836616e-07,
      "loss": 0.1765,
      "reward": 0.6305188983678818,
      "reward_std": 0.5979669764637947,
      "rewards/cosine_scaled_reward": 0.023592765908688307,
      "rewards/format_reward": 0.5833333507180214,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1251.3125305175781,
      "epoch": 0.256,
      "grad_norm": 3.4418620220945138,
      "kl": 1.376953125,
      "learning_rate": 7.068574212948169e-07,
      "loss": 0.1723,
      "reward": 0.5104624545201659,
      "reward_std": 0.25178899243474007,
      "rewards/cosine_scaled_reward": -0.06768545880913734,
      "rewards/format_reward": 0.6458333432674408,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 924.3541870117188,
      "epoch": 0.2571428571428571,
      "grad_norm": 6.348797231777103,
      "kl": 0.9375,
      "learning_rate": 7.039090644965509e-07,
      "loss": 0.1337,
      "reward": 0.7791457176208496,
      "reward_std": 0.7603946030139923,
      "rewards/cosine_scaled_reward": 0.07707285927608609,
      "rewards/format_reward": 0.6250000223517418,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1052.5000305175781,
      "epoch": 0.2582857142857143,
      "grad_norm": 3.9386080018485288,
      "kl": 1.52734375,
      "learning_rate": 7.009532063876148e-07,
      "loss": 0.2459,
      "reward": 0.46499455720186234,
      "reward_std": 0.6090477257966995,
      "rewards/cosine_scaled_reward": -0.09041939489543438,
      "rewards/format_reward": 0.6458333432674408,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 892.1875152587891,
      "epoch": 0.25942857142857145,
      "grad_norm": 3.4313724086317445,
      "kl": 1.125,
      "learning_rate": 6.979899910323624e-07,
      "loss": 0.1959,
      "reward": 0.5925753712654114,
      "reward_std": 0.8098603934049606,
      "rewards/cosine_scaled_reward": -0.04746231180615723,
      "rewards/format_reward": 0.6875000298023224,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1091.6458740234375,
      "epoch": 0.26057142857142856,
      "grad_norm": 4.447647427267497,
      "kl": 1.66015625,
      "learning_rate": 6.950195628537299e-07,
      "loss": 0.1179,
      "reward": 0.24639339372515678,
      "reward_std": 0.48318010196089745,
      "rewards/cosine_scaled_reward": -0.17888664733618498,
      "rewards/format_reward": 0.6041666865348816,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 809.2916870117188,
      "epoch": 0.26171428571428573,
      "grad_norm": 8.169532609480521,
      "kl": 2.046875,
      "learning_rate": 6.920420666261961e-07,
      "loss": 0.3082,
      "reward": 0.5617873594164848,
      "reward_std": 0.7489510700106621,
      "rewards/cosine_scaled_reward": -0.07327299565076828,
      "rewards/format_reward": 0.7083333432674408,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1110.3542175292969,
      "epoch": 0.26285714285714284,
      "grad_norm": 2.921180843223507,
      "kl": 2.2265625,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.1487,
      "reward": 0.4394577872008085,
      "reward_std": 0.4748491495847702,
      "rewards/cosine_scaled_reward": -0.05110444873571396,
      "rewards/format_reward": 0.5416666977107525,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1026.3542175292969,
      "epoch": 0.264,
      "grad_norm": 2.544177744090501,
      "kl": 1.572265625,
      "learning_rate": 6.860664508377001e-07,
      "loss": 0.1564,
      "reward": 0.2407762985676527,
      "reward_std": 0.5902754589915276,
      "rewards/cosine_scaled_reward": -0.20252852141857147,
      "rewards/format_reward": 0.645833358168602,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1030.8958587646484,
      "epoch": 0.2651428571428571,
      "grad_norm": 3.5304119337525526,
      "kl": 1.529296875,
      "learning_rate": 6.83068622519821e-07,
      "loss": 0.1109,
      "reward": 0.42541009094566107,
      "reward_std": 0.6807678937911987,
      "rewards/cosine_scaled_reward": -0.11021162755787373,
      "rewards/format_reward": 0.6458333507180214,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1073.3333435058594,
      "epoch": 0.2662857142857143,
      "grad_norm": 3.0267711493511382,
      "kl": 1.1796875,
      "learning_rate": 6.800643086250121e-07,
      "loss": 0.2702,
      "reward": 0.42545080557465553,
      "reward_std": 0.48426005244255066,
      "rewards/cosine_scaled_reward": -0.15185793861746788,
      "rewards/format_reward": 0.7291666716337204,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1166.9791870117188,
      "epoch": 0.2674285714285714,
      "grad_norm": 2.956369605796136,
      "kl": 1.1279296875,
      "learning_rate": 6.770536555792944e-07,
      "loss": 0.1076,
      "reward": 0.3714570254087448,
      "reward_std": 0.650765061378479,
      "rewards/cosine_scaled_reward": -0.13718816195614636,
      "rewards/format_reward": 0.645833358168602,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1054.6667175292969,
      "epoch": 0.26857142857142857,
      "grad_norm": 4.47554265499188,
      "kl": 1.21484375,
      "learning_rate": 6.740368101176495e-07,
      "loss": 0.2849,
      "reward": 0.6623743935488164,
      "reward_std": 0.7155829221010208,
      "rewards/cosine_scaled_reward": -0.012562822550535202,
      "rewards/format_reward": 0.6875000223517418,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1096.1875457763672,
      "epoch": 0.26971428571428574,
      "grad_norm": 4.925975683565178,
      "kl": 1.3408203125,
      "learning_rate": 6.710139192768694e-07,
      "loss": 0.2351,
      "reward": 0.26786297000944614,
      "reward_std": 0.5117842257022858,
      "rewards/cosine_scaled_reward": -0.2202351950109005,
      "rewards/format_reward": 0.708333358168602,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 983.2291870117188,
      "epoch": 0.27085714285714285,
      "grad_norm": 2.226077510557553,
      "kl": 0.77294921875,
      "learning_rate": 6.679851303883891e-07,
      "loss": 0.1527,
      "reward": 0.5171467587351799,
      "reward_std": 0.5790724456310272,
      "rewards/cosine_scaled_reward": -0.10600997135043144,
      "rewards/format_reward": 0.7291666865348816,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1015.9167175292969,
      "epoch": 0.272,
      "grad_norm": 2.746018994596942,
      "kl": 1.0703125,
      "learning_rate": 6.649505910711058e-07,
      "loss": 0.1685,
      "reward": 0.4093864783644676,
      "reward_std": 0.5853541940450668,
      "rewards/cosine_scaled_reward": -0.1911400929093361,
      "rewards/format_reward": 0.7916667014360428,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1138.8542022705078,
      "epoch": 0.27314285714285713,
      "grad_norm": 2.366422791383297,
      "kl": 1.3916015625,
      "learning_rate": 6.619104492241847e-07,
      "loss": 0.1319,
      "reward": 0.03224743437021971,
      "reward_std": 0.40017952769994736,
      "rewards/cosine_scaled_reward": -0.2963762879371643,
      "rewards/format_reward": 0.6250000298023224,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 916.8125305175781,
      "epoch": 0.2742857142857143,
      "grad_norm": 1.7577643969871468,
      "kl": 1.291015625,
      "learning_rate": 6.588648530198504e-07,
      "loss": 0.13,
      "reward": 0.8863477371633053,
      "reward_std": 0.6274040639400482,
      "rewards/cosine_scaled_reward": 0.10984052997082472,
      "rewards/format_reward": 0.6666666865348816,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 891.0417022705078,
      "epoch": 0.2754285714285714,
      "grad_norm": 2.841473966918375,
      "kl": 1.0361328125,
      "learning_rate": 6.558139508961654e-07,
      "loss": 0.1554,
      "reward": 0.48904264718294144,
      "reward_std": 0.669127531349659,
      "rewards/cosine_scaled_reward": -0.16172868385910988,
      "rewards/format_reward": 0.8125000149011612,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 900.8541870117188,
      "epoch": 0.2765714285714286,
      "grad_norm": 4.202915193648642,
      "kl": 0.96337890625,
      "learning_rate": 6.527578915497951e-07,
      "loss": 0.1132,
      "reward": 0.6491687893867493,
      "reward_std": 0.6397206410765648,
      "rewards/cosine_scaled_reward": -0.08166561461985111,
      "rewards/format_reward": 0.8125000149011612,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 871.3542022705078,
      "epoch": 0.2777142857142857,
      "grad_norm": 4.013401867872089,
      "kl": 1.2275390625,
      "learning_rate": 6.496968239287603e-07,
      "loss": 0.0343,
      "reward": 0.6437305957078934,
      "reward_std": 0.566775843501091,
      "rewards/cosine_scaled_reward": -0.06355137238278985,
      "rewards/format_reward": 0.7708333432674408,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1051.8541870117188,
      "epoch": 0.27885714285714286,
      "grad_norm": 2.0640323982742346,
      "kl": 1.2119140625,
      "learning_rate": 6.466308972251785e-07,
      "loss": 0.1283,
      "reward": 0.6993502229452133,
      "reward_std": 0.8381707072257996,
      "rewards/cosine_scaled_reward": -0.04615823458880186,
      "rewards/format_reward": 0.7916667014360428,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 926.8958587646484,
      "epoch": 0.28,
      "grad_norm": 2.3095581027269456,
      "kl": 1.2373046875,
      "learning_rate": 6.435602608679916e-07,
      "loss": 0.1728,
      "reward": 0.5032865107059479,
      "reward_std": 0.4741464629769325,
      "rewards/cosine_scaled_reward": -0.15460674837231636,
      "rewards/format_reward": 0.8125000298023224,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 948.7292022705078,
      "epoch": 0.28114285714285714,
      "grad_norm": 2.2705966167509697,
      "kl": 1.0166015625,
      "learning_rate": 6.404850645156841e-07,
      "loss": 0.0879,
      "reward": 0.5439350083470345,
      "reward_std": 0.6458217911422253,
      "rewards/cosine_scaled_reward": -0.11344920098781586,
      "rewards/format_reward": 0.770833358168602,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 766.6250152587891,
      "epoch": 0.2822857142857143,
      "grad_norm": 4.218176679768865,
      "kl": 1.375,
      "learning_rate": 6.374054580489873e-07,
      "loss": 0.1529,
      "reward": 0.7583817802369595,
      "reward_std": 0.9407426938414574,
      "rewards/cosine_scaled_reward": 0.02502422034740448,
      "rewards/format_reward": 0.7083333432674408,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1149.2708435058594,
      "epoch": 0.2834285714285714,
      "grad_norm": 2.966316254338991,
      "kl": 1.69921875,
      "learning_rate": 6.343215915635761e-07,
      "loss": 0.1307,
      "reward": 0.37028552405536175,
      "reward_std": 0.35450038872659206,
      "rewards/cosine_scaled_reward": -0.15860724076628685,
      "rewards/format_reward": 0.6875000298023224,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1236.3750305175781,
      "epoch": 0.2845714285714286,
      "grad_norm": 2.8644099570080126,
      "kl": 1.646484375,
      "learning_rate": 6.31233615362752e-07,
      "loss": 0.142,
      "reward": 0.3449726775288582,
      "reward_std": 0.7856429815292358,
      "rewards/cosine_scaled_reward": -0.09834698960185051,
      "rewards/format_reward": 0.5416666865348816,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 977.7500305175781,
      "epoch": 0.2857142857142857,
      "grad_norm": 1.9099821609277308,
      "kl": 0.921875,
      "learning_rate": 6.281416799501187e-07,
      "loss": 0.0404,
      "reward": 0.6945669716224074,
      "reward_std": 0.822948083281517,
      "rewards/cosine_scaled_reward": -0.048549871891736984,
      "rewards/format_reward": 0.7916666716337204,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1265.9583740234375,
      "epoch": 0.28685714285714287,
      "grad_norm": 2.751476452748249,
      "kl": 1.216796875,
      "learning_rate": 6.250459360222e-07,
      "loss": 0.1111,
      "reward": 0.12667130306363106,
      "reward_std": 0.7467320710420609,
      "rewards/cosine_scaled_reward": -0.17624769732356071,
      "rewards/format_reward": 0.4791666865348816,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1031.0833740234375,
      "epoch": 0.288,
      "grad_norm": 3.701835452468544,
      "kl": 1.033203125,
      "learning_rate": 6.219465344613258e-07,
      "loss": 0.2332,
      "reward": 0.3126375643769279,
      "reward_std": 0.748970627784729,
      "rewards/cosine_scaled_reward": -0.09368122089654207,
      "rewards/format_reward": 0.5000000074505806,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 889.5833587646484,
      "epoch": 0.28914285714285715,
      "grad_norm": 5.141640270028422,
      "kl": 1.69921875,
      "learning_rate": 6.188436263278172e-07,
      "loss": -0.1188,
      "reward": 0.23392239259555936,
      "reward_std": 0.8090809062123299,
      "rewards/cosine_scaled_reward": -0.11220548488199711,
      "rewards/format_reward": 0.4583333432674408,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 912.5208587646484,
      "epoch": 0.29028571428571426,
      "grad_norm": 3.5136083178201183,
      "kl": 1.1953125,
      "learning_rate": 6.157373628530852e-07,
      "loss": 0.1793,
      "reward": 0.7197382766753435,
      "reward_std": 0.9268201515078545,
      "rewards/cosine_scaled_reward": 0.057785794138908386,
      "rewards/format_reward": 0.6041666865348816,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1043.3333740234375,
      "epoch": 0.2914285714285714,
      "grad_norm": 2.8576463073310023,
      "kl": 1.361328125,
      "learning_rate": 6.126278954320294e-07,
      "loss": 0.1618,
      "reward": 0.21097473427653313,
      "reward_std": 0.8950171619653702,
      "rewards/cosine_scaled_reward": -0.08201263658702374,
      "rewards/format_reward": 0.3750000074505806,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1132.9166870117188,
      "epoch": 0.2925714285714286,
      "grad_norm": 2.6390372016890877,
      "kl": 0.9296875,
      "learning_rate": 6.095153756157051e-07,
      "loss": 0.1517,
      "reward": 0.3409617803990841,
      "reward_std": 0.7687749713659286,
      "rewards/cosine_scaled_reward": -0.142019122838974,
      "rewards/format_reward": 0.6250000149011612,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1172.1458740234375,
      "epoch": 0.2937142857142857,
      "grad_norm": 1.7999790033387904,
      "kl": 0.8994140625,
      "learning_rate": 6.06399955103937e-07,
      "loss": 0.0345,
      "reward": 0.24714069813489914,
      "reward_std": 0.526521310210228,
      "rewards/cosine_scaled_reward": -0.20976299978792667,
      "rewards/format_reward": 0.6666667014360428,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1025.2292022705078,
      "epoch": 0.2948571428571429,
      "grad_norm": 3.7817000702854284,
      "kl": 0.9970703125,
      "learning_rate": 6.032817857379256e-07,
      "loss": 0.0254,
      "reward": 0.371606208384037,
      "reward_std": 0.8782027065753937,
      "rewards/cosine_scaled_reward": -0.10586357489228249,
      "rewards/format_reward": 0.583333358168602,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1027.1041870117188,
      "epoch": 0.296,
      "grad_norm": 2.2007546083055627,
      "kl": 1.23828125,
      "learning_rate": 6.001610194928464e-07,
      "loss": 0.1329,
      "reward": 0.2863161154091358,
      "reward_std": 0.6974881812930107,
      "rewards/cosine_scaled_reward": -0.16934195160865784,
      "rewards/format_reward": 0.6250000149011612,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1060.1666870117188,
      "epoch": 0.29714285714285715,
      "grad_norm": 2.0712856185453226,
      "kl": 1.314453125,
      "learning_rate": 5.97037808470444e-07,
      "loss": -0.0031,
      "reward": 0.05191618762910366,
      "reward_std": 0.5254812240600586,
      "rewards/cosine_scaled_reward": -0.1927919089794159,
      "rewards/format_reward": 0.4375000074505806,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 800.5625152587891,
      "epoch": 0.29828571428571427,
      "grad_norm": 3.953323642394609,
      "kl": 1.18359375,
      "learning_rate": 5.939123048916173e-07,
      "loss": 0.1926,
      "reward": 0.16135332686826587,
      "reward_std": 0.6497361660003662,
      "rewards/cosine_scaled_reward": -0.21099001914262772,
      "rewards/format_reward": 0.5833333432674408,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 906.3542022705078,
      "epoch": 0.29942857142857143,
      "grad_norm": 6.975231366994329,
      "kl": 1.1025390625,
      "learning_rate": 5.907846610890011e-07,
      "loss": 0.2163,
      "reward": 0.13131073210388422,
      "reward_std": 0.5159479975700378,
      "rewards/cosine_scaled_reward": -0.1739279804751277,
      "rewards/format_reward": 0.47916667722165585,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 981.3958587646484,
      "epoch": 0.30057142857142854,
      "grad_norm": 3.6462739135853304,
      "kl": 0.93359375,
      "learning_rate": 5.87655029499542e-07,
      "loss": 0.2144,
      "reward": 0.2528093755245209,
      "reward_std": 0.6878427565097809,
      "rewards/cosine_scaled_reward": -0.19651199039071798,
      "rewards/format_reward": 0.645833358168602,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1036.4792022705078,
      "epoch": 0.3017142857142857,
      "grad_norm": 2.4186369761638797,
      "kl": 1.11328125,
      "learning_rate": 5.845235626570683e-07,
      "loss": 0.0094,
      "reward": 0.34765794809209183,
      "reward_std": 0.7917995601892471,
      "rewards/cosine_scaled_reward": -0.10742103308439255,
      "rewards/format_reward": 0.5625000149011612,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 988.1458587646484,
      "epoch": 0.3028571428571429,
      "grad_norm": 3.8358402184782845,
      "kl": 1.125,
      "learning_rate": 5.813904131848564e-07,
      "loss": 0.1412,
      "reward": 0.22985844686627388,
      "reward_std": 0.4855259954929352,
      "rewards/cosine_scaled_reward": -0.17673744820058346,
      "rewards/format_reward": 0.5833333358168602,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 853.1042022705078,
      "epoch": 0.304,
      "grad_norm": 3.155418565951925,
      "kl": 1.138671875,
      "learning_rate": 5.78255733788191e-07,
      "loss": -0.0981,
      "reward": 0.23544084653258324,
      "reward_std": 0.5617225617170334,
      "rewards/cosine_scaled_reward": -0.18436292186379433,
      "rewards/format_reward": 0.6041666865348816,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1040.7291870117188,
      "epoch": 0.30514285714285716,
      "grad_norm": 4.49377424287265,
      "kl": 1.8671875,
      "learning_rate": 5.751196772469237e-07,
      "loss": 0.3133,
      "reward": 0.019660448655486107,
      "reward_std": 0.5969599932432175,
      "rewards/cosine_scaled_reward": -0.14641978219151497,
      "rewards/format_reward": 0.3125000149011612,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1120.3334045410156,
      "epoch": 0.3062857142857143,
      "grad_norm": 2.9296163486934588,
      "kl": 1.455078125,
      "learning_rate": 5.71982396408026e-07,
      "loss": 0.0891,
      "reward": 0.019381534308195114,
      "reward_std": 0.6385679095983505,
      "rewards/cosine_scaled_reward": -0.188225906342268,
      "rewards/format_reward": 0.3958333432674408,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 940.0833587646484,
      "epoch": 0.30742857142857144,
      "grad_norm": 3.99474649335861,
      "kl": 1.58203125,
      "learning_rate": 5.688440441781398e-07,
      "loss": 0.2037,
      "reward": 0.21233398653566837,
      "reward_std": 0.5940781682729721,
      "rewards/cosine_scaled_reward": -0.17508301883935928,
      "rewards/format_reward": 0.5625000149011612,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 820.8541870117188,
      "epoch": 0.30857142857142855,
      "grad_norm": 3.64920081986899,
      "kl": 1.548828125,
      "learning_rate": 5.657047735161255e-07,
      "loss": 0.187,
      "reward": 0.287849310785532,
      "reward_std": 0.7942548245191574,
      "rewards/cosine_scaled_reward": -0.16857536626048386,
      "rewards/format_reward": 0.6250000298023224,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 918.2708435058594,
      "epoch": 0.3097142857142857,
      "grad_norm": 4.142397150940974,
      "kl": 1.3642578125,
      "learning_rate": 5.625647374256061e-07,
      "loss": -0.0034,
      "reward": 0.21712711825966835,
      "reward_std": 0.7582554370164871,
      "rewards/cosine_scaled_reward": -0.1726864455267787,
      "rewards/format_reward": 0.5625000223517418,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1067.3125305175781,
      "epoch": 0.31085714285714283,
      "grad_norm": 5.568481701496752,
      "kl": 1.576171875,
      "learning_rate": 5.594240889475106e-07,
      "loss": 0.2629,
      "reward": 0.07018839695956558,
      "reward_std": 0.6307368651032448,
      "rewards/cosine_scaled_reward": -0.17323914170265198,
      "rewards/format_reward": 0.4166666753590107,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1032.7916870117188,
      "epoch": 0.312,
      "grad_norm": 2.7380334201594207,
      "kl": 1.763671875,
      "learning_rate": 5.562829811526154e-07,
      "loss": 0.1532,
      "reward": 0.1198783004656434,
      "reward_std": 0.5959479659795761,
      "rewards/cosine_scaled_reward": -0.15881085954606533,
      "rewards/format_reward": 0.4375000149011612,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1005.0000305175781,
      "epoch": 0.31314285714285717,
      "grad_norm": 3.288058849096818,
      "kl": 1.3232421875,
      "learning_rate": 5.531415671340826e-07,
      "loss": 0.0679,
      "reward": 0.33828355744481087,
      "reward_std": 0.7625949904322624,
      "rewards/cosine_scaled_reward": -0.1329415813088417,
      "rewards/format_reward": 0.6041666716337204,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1209.2083892822266,
      "epoch": 0.3142857142857143,
      "grad_norm": 3.384369498507843,
      "kl": 1.3759765625,
      "learning_rate": 5.5e-07,
      "loss": 0.1487,
      "reward": 0.2773652821779251,
      "reward_std": 0.7781829237937927,
      "rewards/cosine_scaled_reward": -0.09048402030020952,
      "rewards/format_reward": 0.45833334885537624,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 871.2500152587891,
      "epoch": 0.31542857142857145,
      "grad_norm": 3.6001944034052666,
      "kl": 1.2470703125,
      "learning_rate": 5.468584328659172e-07,
      "loss": 0.2545,
      "reward": 0.4259207919239998,
      "reward_std": 0.7986200153827667,
      "rewards/cosine_scaled_reward": -0.1099562719464302,
      "rewards/format_reward": 0.6458333432674408,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 970.1458740234375,
      "epoch": 0.31657142857142856,
      "grad_norm": 5.098242367200561,
      "kl": 1.9375,
      "learning_rate": 5.437170188473847e-07,
      "loss": 0.0347,
      "reward": 0.1577397957444191,
      "reward_std": 0.8665766268968582,
      "rewards/cosine_scaled_reward": -0.16071344492956996,
      "rewards/format_reward": 0.479166679084301,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1138.0417175292969,
      "epoch": 0.3177142857142857,
      "grad_norm": 4.893358334263393,
      "kl": 1.51953125,
      "learning_rate": 5.405759110524894e-07,
      "loss": 0.2335,
      "reward": 0.2129652127623558,
      "reward_std": 0.8123987764120102,
      "rewards/cosine_scaled_reward": -0.1122674010694027,
      "rewards/format_reward": 0.4375000149011612,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1055.5416870117188,
      "epoch": 0.31885714285714284,
      "grad_norm": 11.325087114885777,
      "kl": 1.70703125,
      "learning_rate": 5.37435262574394e-07,
      "loss": 0.1758,
      "reward": 0.2276703668758273,
      "reward_std": 0.7087787315249443,
      "rewards/cosine_scaled_reward": -0.14658149890601635,
      "rewards/format_reward": 0.520833358168602,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1201.5625305175781,
      "epoch": 0.32,
      "grad_norm": 4.499791162135755,
      "kl": 1.3359375,
      "learning_rate": 5.342952264838747e-07,
      "loss": 0.199,
      "reward": 0.4334499780088663,
      "reward_std": 0.8222155347466469,
      "rewards/cosine_scaled_reward": -0.0853583601419814,
      "rewards/format_reward": 0.6041666716337204,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1183.5833740234375,
      "epoch": 0.3211428571428571,
      "grad_norm": 3.6400895329844336,
      "kl": 1.931640625,
      "learning_rate": 5.311559558218603e-07,
      "loss": 0.0286,
      "reward": -0.14555206894874573,
      "reward_std": 0.4930955022573471,
      "rewards/cosine_scaled_reward": -0.2081927042454481,
      "rewards/format_reward": 0.2708333395421505,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1227.6875305175781,
      "epoch": 0.3222857142857143,
      "grad_norm": 3.351330372342759,
      "kl": 1.51953125,
      "learning_rate": 5.28017603591974e-07,
      "loss": 0.1735,
      "reward": 0.08991836942732334,
      "reward_std": 0.7664570957422256,
      "rewards/cosine_scaled_reward": -0.1946241520345211,
      "rewards/format_reward": 0.479166679084301,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1011.8958435058594,
      "epoch": 0.32342857142857145,
      "grad_norm": 3.607306150140324,
      "kl": 1.52734375,
      "learning_rate": 5.248803227530763e-07,
      "loss": 0.1756,
      "reward": -0.16347728297114372,
      "reward_std": 0.6131603866815567,
      "rewards/cosine_scaled_reward": -0.269238643348217,
      "rewards/format_reward": 0.3750000074505806,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1238.1875305175781,
      "epoch": 0.32457142857142857,
      "grad_norm": 3.700854838943554,
      "kl": 1.3291015625,
      "learning_rate": 5.21744266211809e-07,
      "loss": 0.0644,
      "reward": 0.19410160928964615,
      "reward_std": 0.6351519152522087,
      "rewards/cosine_scaled_reward": -0.16336587071418762,
      "rewards/format_reward": 0.520833358168602,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1302.5833435058594,
      "epoch": 0.32571428571428573,
      "grad_norm": 5.590443825333452,
      "kl": 1.396484375,
      "learning_rate": 5.186095868151436e-07,
      "loss": 0.1172,
      "reward": 0.0053066437467350625,
      "reward_std": 0.6190855652093887,
      "rewards/cosine_scaled_reward": -0.1952633447945118,
      "rewards/format_reward": 0.3958333507180214,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1408.7708587646484,
      "epoch": 0.32685714285714285,
      "grad_norm": 5820.413747461295,
      "kl": 44.6220703125,
      "learning_rate": 5.154764373429315e-07,
      "loss": 2.1366,
      "reward": 0.321873364970088,
      "reward_std": 0.7274122461676598,
      "rewards/cosine_scaled_reward": -0.06822998262941837,
      "rewards/format_reward": 0.45833334885537624,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1293.6875305175781,
      "epoch": 0.328,
      "grad_norm": 10688.293773017389,
      "kl": 90.048828125,
      "learning_rate": 5.123449705004581e-07,
      "loss": 3.6012,
      "reward": 0.22728685289621353,
      "reward_std": 0.6926668882369995,
      "rewards/cosine_scaled_reward": -0.10510657541453838,
      "rewards/format_reward": 0.4375000074505806,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1143.1042175292969,
      "epoch": 0.3291428571428571,
      "grad_norm": 69995.08344409091,
      "kl": 821.830078125,
      "learning_rate": 5.09215338910999e-07,
      "loss": 50.9221,
      "reward": 0.3029659762978554,
      "reward_std": 0.8068300932645798,
      "rewards/cosine_scaled_reward": -0.04643368790857494,
      "rewards/format_reward": 0.3958333432674408,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1325.7084045410156,
      "epoch": 0.3302857142857143,
      "grad_norm": 62.300695111663714,
      "kl": 1.5146484375,
      "learning_rate": 5.060876951083828e-07,
      "loss": 0.1171,
      "reward": 0.10640177875757217,
      "reward_std": 0.6392035633325577,
      "rewards/cosine_scaled_reward": -0.08221577852964401,
      "rewards/format_reward": 0.2708333358168602,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1066.375015258789,
      "epoch": 0.3314285714285714,
      "grad_norm": 3.0451709688438138,
      "kl": 0.85791015625,
      "learning_rate": 5.02962191529556e-07,
      "loss": 0.0875,
      "reward": 0.4837397076189518,
      "reward_std": 0.6303973346948624,
      "rewards/cosine_scaled_reward": -0.008130142465233803,
      "rewards/format_reward": 0.5000000074505806,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1176.2292175292969,
      "epoch": 0.3325714285714286,
      "grad_norm": 6.431194933370891,
      "kl": 1.169921875,
      "learning_rate": 4.998389805071536e-07,
      "loss": 0.0944,
      "reward": 0.004224353935569525,
      "reward_std": 0.7458223477005959,
      "rewards/cosine_scaled_reward": -0.17497116327285767,
      "rewards/format_reward": 0.354166679084301,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1279.1875610351562,
      "epoch": 0.33371428571428574,
      "grad_norm": 11.784461019524304,
      "kl": 1.0419921875,
      "learning_rate": 4.967182142620745e-07,
      "loss": 0.0752,
      "reward": -0.019843921065330505,
      "reward_std": 0.5733096897602081,
      "rewards/cosine_scaled_reward": -0.21825530380010605,
      "rewards/format_reward": 0.4166666753590107,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1270.3750305175781,
      "epoch": 0.33485714285714285,
      "grad_norm": 12451.222306718704,
      "kl": 56.82421875,
      "learning_rate": 4.93600044896063e-07,
      "loss": 2.6089,
      "reward": -0.0518635269254446,
      "reward_std": 0.4941852539777756,
      "rewards/cosine_scaled_reward": -0.22384843230247498,
      "rewards/format_reward": 0.3958333507180214,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1304.8750457763672,
      "epoch": 0.336,
      "grad_norm": 354145.9079404987,
      "kl": 3584.8046875,
      "learning_rate": 4.904846243842949e-07,
      "loss": 283.5748,
      "reward": 0.06046904996037483,
      "reward_std": 0.7505204379558563,
      "rewards/cosine_scaled_reward": -0.13643214339390397,
      "rewards/format_reward": 0.3333333358168602,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1317.5625610351562,
      "epoch": 0.33714285714285713,
      "grad_norm": 5.242464203702877,
      "kl": 1.0029296875,
      "learning_rate": 4.873721045679706e-07,
      "loss": 0.1195,
      "reward": 0.005757967010140419,
      "reward_std": 0.6009484976530075,
      "rewards/cosine_scaled_reward": -0.12212102208286524,
      "rewards/format_reward": 0.2500000111758709,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1103.0625610351562,
      "epoch": 0.3382857142857143,
      "grad_norm": 4.2430557491796055,
      "kl": 0.8115234375,
      "learning_rate": 4.842626371469149e-07,
      "loss": 0.0632,
      "reward": 0.0580328986980021,
      "reward_std": 0.6936925277113914,
      "rewards/cosine_scaled_reward": -0.15848355647176504,
      "rewards/format_reward": 0.3750000149011612,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1432.3333435058594,
      "epoch": 0.3394285714285714,
      "grad_norm": 2.9908283966206457,
      "kl": 0.7646484375,
      "learning_rate": 4.811563736721829e-07,
      "loss": 0.1022,
      "reward": -0.011708778678439558,
      "reward_std": 0.5683621913194656,
      "rewards/cosine_scaled_reward": -0.12043773010373116,
      "rewards/format_reward": 0.2291666716337204,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1451.8750305175781,
      "epoch": 0.3405714285714286,
      "grad_norm": 4.214445887739457,
      "kl": 0.673828125,
      "learning_rate": 4.780534655386743e-07,
      "loss": -0.0113,
      "reward": -0.12220606487244368,
      "reward_std": 0.5942584052681923,
      "rewards/cosine_scaled_reward": -0.18610304035246372,
      "rewards/format_reward": 0.2500000074505806,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1379.4791870117188,
      "epoch": 0.3417142857142857,
      "grad_norm": 4.524572878515851,
      "kl": 0.5302734375,
      "learning_rate": 4.749540639777539e-07,
      "loss": -0.0319,
      "reward": -0.08997016213834286,
      "reward_std": 0.6837709844112396,
      "rewards/cosine_scaled_reward": -0.1804017536342144,
      "rewards/format_reward": 0.27083334140479565,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1242.2083740234375,
      "epoch": 0.34285714285714286,
      "grad_norm": 22.44129435449986,
      "kl": 0.6015625,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.047,
      "reward": 0.4733648784458637,
      "reward_std": 0.6498839557170868,
      "rewards/cosine_scaled_reward": -0.013317572651430964,
      "rewards/format_reward": 0.5000000111758709,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1358.5000610351562,
      "epoch": 0.344,
      "grad_norm": 5.451894779313779,
      "kl": 0.55419921875,
      "learning_rate": 4.68766384637248e-07,
      "loss": 0.0201,
      "reward": 0.012628388591110706,
      "reward_std": 0.6598528623580933,
      "rewards/cosine_scaled_reward": -0.11868580989539623,
      "rewards/format_reward": 0.2500000037252903,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1208.0000610351562,
      "epoch": 0.34514285714285714,
      "grad_norm": 2.502133066720727,
      "kl": 0.5078125,
      "learning_rate": 4.656784084364238e-07,
      "loss": 0.0976,
      "reward": 0.01287244912236929,
      "reward_std": 0.6720428466796875,
      "rewards/cosine_scaled_reward": -0.14981378242373466,
      "rewards/format_reward": 0.3125000111758709,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1372.2917175292969,
      "epoch": 0.3462857142857143,
      "grad_norm": 9.527527408809727,
      "kl": 0.591796875,
      "learning_rate": 4.6259454195101267e-07,
      "loss": -0.0351,
      "reward": -0.0026968184392899275,
      "reward_std": 0.7502148300409317,
      "rewards/cosine_scaled_reward": -0.1784317558631301,
      "rewards/format_reward": 0.3541666716337204,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1228.1458435058594,
      "epoch": 0.3474285714285714,
      "grad_norm": 5.5176774561091655,
      "kl": 0.4345703125,
      "learning_rate": 4.59514935484316e-07,
      "loss": 0.1598,
      "reward": 0.39222877379506826,
      "reward_std": 0.840458020567894,
      "rewards/cosine_scaled_reward": -0.03305228240787983,
      "rewards/format_reward": 0.4583333358168602,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1448.8125305175781,
      "epoch": 0.3485714285714286,
      "grad_norm": 7.801875434214254,
      "kl": 0.3525390625,
      "learning_rate": 4.5643973913200837e-07,
      "loss": 0.0808,
      "reward": 0.005279352888464928,
      "reward_std": 0.6858643740415573,
      "rewards/cosine_scaled_reward": -0.1536103216931224,
      "rewards/format_reward": 0.3125000074505806,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1402.6666870117188,
      "epoch": 0.3497142857142857,
      "grad_norm": 3.566822202421308,
      "kl": 0.29638671875,
      "learning_rate": 4.5336910277482155e-07,
      "loss": 0.0791,
      "reward": 0.18335522711277008,
      "reward_std": 0.6350644528865814,
      "rewards/cosine_scaled_reward": -0.13748905574902892,
      "rewards/format_reward": 0.4583333432674408,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1421.8541870117188,
      "epoch": 0.35085714285714287,
      "grad_norm": 1.9532542741070622,
      "kl": 0.289794921875,
      "learning_rate": 4.503031760712397e-07,
      "loss": 0.0514,
      "reward": 0.2609965428709984,
      "reward_std": 0.7012953609228134,
      "rewards/cosine_scaled_reward": -0.06741839554160833,
      "rewards/format_reward": 0.39583334885537624,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1331.2500305175781,
      "epoch": 0.352,
      "grad_norm": 2.135773174322825,
      "kl": 0.26416015625,
      "learning_rate": 4.4724210845020494e-07,
      "loss": 0.1508,
      "reward": 0.21997906267642975,
      "reward_std": 0.6842755973339081,
      "rewards/cosine_scaled_reward": -0.1191771375015378,
      "rewards/format_reward": 0.4583333358168602,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1291.1666870117188,
      "epoch": 0.35314285714285715,
      "grad_norm": 3.030174625800062,
      "kl": 0.323486328125,
      "learning_rate": 4.441860491038345e-07,
      "loss": 0.1012,
      "reward": -0.060309079475700855,
      "reward_std": 0.48270438611507416,
      "rewards/cosine_scaled_reward": -0.16557121649384499,
      "rewards/format_reward": 0.2708333469927311,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1296.8125457763672,
      "epoch": 0.35428571428571426,
      "grad_norm": 3.288974321286699,
      "kl": 0.30712890625,
      "learning_rate": 4.4113514698014953e-07,
      "loss": 0.1053,
      "reward": 0.3812438789755106,
      "reward_std": 0.6454566046595573,
      "rewards/cosine_scaled_reward": 0.0031219255179166794,
      "rewards/format_reward": 0.37500000558793545,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1582.4167175292969,
      "epoch": 0.3554285714285714,
      "grad_norm": 11.037201589242047,
      "kl": 0.3916015625,
      "learning_rate": 4.3808955077581546e-07,
      "loss": 0.0554,
      "reward": 0.011564895510673523,
      "reward_std": 0.5866778641939163,
      "rewards/cosine_scaled_reward": -0.12963422574102879,
      "rewards/format_reward": 0.27083333767950535,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1511.1458740234375,
      "epoch": 0.3565714285714286,
      "grad_norm": 541.360267852673,
      "kl": 2.48046875,
      "learning_rate": 4.350494089288943e-07,
      "loss": 0.1743,
      "reward": 0.09507806971669197,
      "reward_std": 0.7126565277576447,
      "rewards/cosine_scaled_reward": -0.12954430282115936,
      "rewards/format_reward": 0.3541666716337204,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1310.4792022705078,
      "epoch": 0.3577142857142857,
      "grad_norm": 1.6060292822301743,
      "kl": 0.2235107421875,
      "learning_rate": 4.3201486961161093e-07,
      "loss": 0.0119,
      "reward": 0.19681214727461338,
      "reward_std": 0.5347588732838631,
      "rewards/cosine_scaled_reward": -0.14117726124823093,
      "rewards/format_reward": 0.4791666902601719,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1412.7708740234375,
      "epoch": 0.3588571428571429,
      "grad_norm": 0.9234012789427545,
      "kl": 0.2705078125,
      "learning_rate": 4.2898608072313045e-07,
      "loss": 0.0522,
      "reward": 0.1253851738292724,
      "reward_std": 0.5503663271665573,
      "rewards/cosine_scaled_reward": -0.1352240853011608,
      "rewards/format_reward": 0.3958333469927311,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1183.7916870117188,
      "epoch": 0.36,
      "grad_norm": 1.7837131712349448,
      "kl": 0.248779296875,
      "learning_rate": 4.2596318988235037e-07,
      "loss": 0.1102,
      "reward": 0.06632774323225021,
      "reward_std": 0.8003478944301605,
      "rewards/cosine_scaled_reward": -0.14391947723925114,
      "rewards/format_reward": 0.354166679084301,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1321.1667022705078,
      "epoch": 0.36114285714285715,
      "grad_norm": 3.8904561936208473,
      "kl": 0.311279296875,
      "learning_rate": 4.2294634442070553e-07,
      "loss": 0.0684,
      "reward": -0.12211128510534763,
      "reward_std": 0.3644377589225769,
      "rewards/cosine_scaled_reward": -0.19647231698036194,
      "rewards/format_reward": 0.27083333767950535,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1472.5208435058594,
      "epoch": 0.36228571428571427,
      "grad_norm": 0.6761305622628668,
      "kl": 0.2392578125,
      "learning_rate": 4.1993569137498776e-07,
      "loss": 0.0051,
      "reward": 0.07694595551583916,
      "reward_std": 0.698570191860199,
      "rewards/cosine_scaled_reward": -0.08652702532708645,
      "rewards/format_reward": 0.25000000186264515,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1446.3959045410156,
      "epoch": 0.36342857142857143,
      "grad_norm": 1.610083766620256,
      "kl": 0.23046875,
      "learning_rate": 4.1693137748017915e-07,
      "loss": 0.1272,
      "reward": 0.22593690548092127,
      "reward_std": 0.7007799595594406,
      "rewards/cosine_scaled_reward": -0.11619820445775986,
      "rewards/format_reward": 0.45833336375653744,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1419.7500610351562,
      "epoch": 0.36457142857142855,
      "grad_norm": 1.3177147357732026,
      "kl": 0.3505859375,
      "learning_rate": 4.1393354916230005e-07,
      "loss": 0.0908,
      "reward": 0.05421498417854309,
      "reward_std": 0.6087209582328796,
      "rewards/cosine_scaled_reward": -0.10830917488783598,
      "rewards/format_reward": 0.27083334140479565,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1459.2917175292969,
      "epoch": 0.3657142857142857,
      "grad_norm": 2.383045046585821,
      "kl": 0.177001953125,
      "learning_rate": 4.1094235253127374e-07,
      "loss": 0.143,
      "reward": 0.23994141444563866,
      "reward_std": 0.7169264256954193,
      "rewards/cosine_scaled_reward": -0.08836262859404087,
      "rewards/format_reward": 0.4166666753590107,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1525.8542175292969,
      "epoch": 0.3668571428571429,
      "grad_norm": 1.4014039132566267,
      "kl": 0.327880859375,
      "learning_rate": 4.079579333738039e-07,
      "loss": 0.0636,
      "reward": 0.07618786534294486,
      "reward_std": 0.6110149621963501,
      "rewards/cosine_scaled_reward": -0.17023939825594425,
      "rewards/format_reward": 0.416666679084301,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1505.1042022705078,
      "epoch": 0.368,
      "grad_norm": 0.9016635108285753,
      "kl": 0.18017578125,
      "learning_rate": 4.0498043714627006e-07,
      "loss": 0.0766,
      "reward": 0.1637781597673893,
      "reward_std": 0.6868859454989433,
      "rewards/cosine_scaled_reward": -0.13686091732233763,
      "rewards/format_reward": 0.4375000186264515,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1378.7500305175781,
      "epoch": 0.36914285714285716,
      "grad_norm": 1.1982814454055981,
      "kl": 0.39306640625,
      "learning_rate": 4.020100089676376e-07,
      "loss": 0.0913,
      "reward": 0.17529202857986093,
      "reward_std": 0.6956184059381485,
      "rewards/cosine_scaled_reward": -0.14152065757662058,
      "rewards/format_reward": 0.4583333432674408,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1414.5208740234375,
      "epoch": 0.3702857142857143,
      "grad_norm": 5.168812943695912,
      "kl": 0.2706298828125,
      "learning_rate": 3.9904679361238526e-07,
      "loss": 0.0758,
      "reward": -0.05163134215399623,
      "reward_std": 0.573038712143898,
      "rewards/cosine_scaled_reward": -0.2133156731724739,
      "rewards/format_reward": 0.37500000558793545,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1026.2292175292969,
      "epoch": 0.37142857142857144,
      "grad_norm": 2.717389747197644,
      "kl": 0.2042236328125,
      "learning_rate": 3.9609093550344907e-07,
      "loss": 0.0446,
      "reward": 0.35916636511683464,
      "reward_std": 0.7165441811084747,
      "rewards/cosine_scaled_reward": -0.11208349000662565,
      "rewards/format_reward": 0.583333358168602,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1296.2083435058594,
      "epoch": 0.37257142857142855,
      "grad_norm": 0.9706132798560072,
      "kl": 0.1436767578125,
      "learning_rate": 3.931425787051832e-07,
      "loss": 0.0264,
      "reward": 0.03931037150323391,
      "reward_std": 0.5944674462080002,
      "rewards/cosine_scaled_reward": -0.24076148495078087,
      "rewards/format_reward": 0.5208333395421505,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1119.3958740234375,
      "epoch": 0.3737142857142857,
      "grad_norm": 7.20904098295775,
      "kl": 0.34619140625,
      "learning_rate": 3.902018669163384e-07,
      "loss": 0.0023,
      "reward": 0.5026027010753751,
      "reward_std": 0.4505321756005287,
      "rewards/cosine_scaled_reward": 0.011718038469552994,
      "rewards/format_reward": 0.4791666716337204,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1340.9791717529297,
      "epoch": 0.37485714285714283,
      "grad_norm": 1.2860908020915138,
      "kl": 0.416259765625,
      "learning_rate": 3.872689434630585e-07,
      "loss": 0.1449,
      "reward": 0.15127216652035713,
      "reward_std": 0.6304197087883949,
      "rewards/cosine_scaled_reward": -0.15353058651089668,
      "rewards/format_reward": 0.4583333507180214,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1571.3750610351562,
      "epoch": 0.376,
      "grad_norm": 0.8293118478307562,
      "kl": 0.242431640625,
      "learning_rate": 3.843439512918949e-07,
      "loss": 0.0229,
      "reward": 0.09288652800023556,
      "reward_std": 0.5842361897230148,
      "rewards/cosine_scaled_reward": -0.15147340297698975,
      "rewards/format_reward": 0.3958333432674408,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1407.9583740234375,
      "epoch": 0.37714285714285717,
      "grad_norm": 1.189781094856149,
      "kl": 0.1077880859375,
      "learning_rate": 3.8142703296283953e-07,
      "loss": 0.0681,
      "reward": -0.09090141206979752,
      "reward_std": 0.5390855148434639,
      "rewards/cosine_scaled_reward": -0.21211737021803856,
      "rewards/format_reward": 0.3333333395421505,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1255.6875457763672,
      "epoch": 0.3782857142857143,
      "grad_norm": 1.046472107288498,
      "kl": 0.10308837890625,
      "learning_rate": 3.785183306423767e-07,
      "loss": 0.0811,
      "reward": -0.12841611605836079,
      "reward_std": 0.39798876643180847,
      "rewards/cosine_scaled_reward": -0.3350413963198662,
      "rewards/format_reward": 0.5416666865348816,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1201.8958740234375,
      "epoch": 0.37942857142857145,
      "grad_norm": 1.123018980255247,
      "kl": 0.117584228515625,
      "learning_rate": 3.7561798609655373e-07,
      "loss": 0.072,
      "reward": 0.499036006629467,
      "reward_std": 0.6711834743618965,
      "rewards/cosine_scaled_reward": -0.03173201950266957,
      "rewards/format_reward": 0.5625000074505806,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1133.3333587646484,
      "epoch": 0.38057142857142856,
      "grad_norm": 2.177638459571002,
      "kl": 0.14453125,
      "learning_rate": 3.72726140684072e-07,
      "loss": 0.1488,
      "reward": 0.03351620538160205,
      "reward_std": 0.4431127682328224,
      "rewards/cosine_scaled_reward": -0.27490856871008873,
      "rewards/format_reward": 0.5833333432674408,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1252.5833587646484,
      "epoch": 0.38171428571428573,
      "grad_norm": 1.6680786188797292,
      "kl": 2.4984130859375,
      "learning_rate": 3.6984293534939737e-07,
      "loss": 0.1246,
      "reward": -0.1514057070016861,
      "reward_std": 0.5695896856486797,
      "rewards/cosine_scaled_reward": -0.26320285350084305,
      "rewards/format_reward": 0.3750000074505806,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1232.0000610351562,
      "epoch": 0.38285714285714284,
      "grad_norm": 1.828125714274309,
      "kl": 0.07843017578125,
      "learning_rate": 3.6696851061588994e-07,
      "loss": 0.1105,
      "reward": 0.07522661844268441,
      "reward_std": 0.5525132827460766,
      "rewards/cosine_scaled_reward": -0.21238669380545616,
      "rewards/format_reward": 0.5000000149011612,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1240.4375305175781,
      "epoch": 0.384,
      "grad_norm": 3.2255921262965432,
      "kl": 0.19232177734375,
      "learning_rate": 3.641030065789562e-07,
      "loss": 0.2104,
      "reward": -0.07903135940432549,
      "reward_std": 0.4235813617706299,
      "rewards/cosine_scaled_reward": -0.3103490248322487,
      "rewards/format_reward": 0.5416666865348816,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1136.2500457763672,
      "epoch": 0.3851428571428571,
      "grad_norm": 2.1359050155328076,
      "kl": 0.298095703125,
      "learning_rate": 3.612465628992203e-07,
      "loss": 0.1271,
      "reward": 0.29203586652874947,
      "reward_std": 0.6221929639577866,
      "rewards/cosine_scaled_reward": -0.14564874302595854,
      "rewards/format_reward": 0.583333358168602,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1304.4792175292969,
      "epoch": 0.3862857142857143,
      "grad_norm": 1.42801024449987,
      "kl": 0.2041015625,
      "learning_rate": 3.5839931879571725e-07,
      "loss": 0.0306,
      "reward": -0.07640792615711689,
      "reward_std": 0.29374565184116364,
      "rewards/cosine_scaled_reward": -0.30903729796409607,
      "rewards/format_reward": 0.5416666716337204,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1483.5625305175781,
      "epoch": 0.38742857142857146,
      "grad_norm": 4.530770296915891,
      "kl": 0.2216796875,
      "learning_rate": 3.555614130391079e-07,
      "loss": 0.0756,
      "reward": -0.22593690641224384,
      "reward_std": 0.42642898857593536,
      "rewards/cosine_scaled_reward": -0.31088512018322945,
      "rewards/format_reward": 0.39583334513008595,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1301.6875305175781,
      "epoch": 0.38857142857142857,
      "grad_norm": 32.229056752997074,
      "kl": 0.72998046875,
      "learning_rate": 3.5273298394491515e-07,
      "loss": 0.0451,
      "reward": 0.1187155619263649,
      "reward_std": 0.6100866496562958,
      "rewards/cosine_scaled_reward": -0.16980887576937675,
      "rewards/format_reward": 0.4583333432674408,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1222.5000610351562,
      "epoch": 0.38971428571428574,
      "grad_norm": 31.15024931066955,
      "kl": 1.814453125,
      "learning_rate": 3.4991416936678276e-07,
      "loss": 0.0053,
      "reward": 0.4647822715342045,
      "reward_std": 0.8535723686218262,
      "rewards/cosine_scaled_reward": 0.013641122728586197,
      "rewards/format_reward": 0.4375000149011612,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1550.9583740234375,
      "epoch": 0.39085714285714285,
      "grad_norm": 5.073035047796139,
      "kl": 0.40185546875,
      "learning_rate": 3.471051066897562e-07,
      "loss": 0.1274,
      "reward": -0.049222253262996674,
      "reward_std": 0.6296448782086372,
      "rewards/cosine_scaled_reward": -0.1704444605857134,
      "rewards/format_reward": 0.29166667349636555,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1254.1458740234375,
      "epoch": 0.392,
      "grad_norm": 2.9987047793682247,
      "kl": 0.191650390625,
      "learning_rate": 3.4430593282358777e-07,
      "loss": 0.132,
      "reward": 0.4507103096693754,
      "reward_std": 0.46682045608758926,
      "rewards/cosine_scaled_reward": -0.11839485540986061,
      "rewards/format_reward": 0.6875000298023224,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1259.0833740234375,
      "epoch": 0.3931428571428571,
      "grad_norm": 11.834773130920754,
      "kl": 0.765625,
      "learning_rate": 3.4151678419606233e-07,
      "loss": 0.1692,
      "reward": 0.04102582670748234,
      "reward_std": 0.6375212371349335,
      "rewards/cosine_scaled_reward": -0.16698708944022655,
      "rewards/format_reward": 0.3750000111758709,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 948.8333587646484,
      "epoch": 0.3942857142857143,
      "grad_norm": 4.082579051373274,
      "kl": 0.11126708984375,
      "learning_rate": 3.387377967463493e-07,
      "loss": 0.1531,
      "reward": 0.32552773877978325,
      "reward_std": 0.5937002822756767,
      "rewards/cosine_scaled_reward": -0.18098615854978561,
      "rewards/format_reward": 0.6875000149011612,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1348.5208740234375,
      "epoch": 0.3954285714285714,
      "grad_norm": 4.16581520032074,
      "kl": 0.233154296875,
      "learning_rate": 3.359691059183761e-07,
      "loss": 0.0891,
      "reward": -0.024696938693523407,
      "reward_std": 0.6840994879603386,
      "rewards/cosine_scaled_reward": -0.2310984805226326,
      "rewards/format_reward": 0.4375000074505806,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1152.9583435058594,
      "epoch": 0.3965714285714286,
      "grad_norm": 6.491892036842968,
      "kl": 0.688232421875,
      "learning_rate": 3.3321084665422803e-07,
      "loss": 0.1813,
      "reward": 0.7761995047330856,
      "reward_std": 0.9014021009206772,
      "rewards/cosine_scaled_reward": 0.08601640490815043,
      "rewards/format_reward": 0.6041666865348816,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1119.6875305175781,
      "epoch": 0.3977142857142857,
      "grad_norm": 6.465035426418669,
      "kl": 0.2274169921875,
      "learning_rate": 3.3046315338757026e-07,
      "loss": 0.3084,
      "reward": 0.1041297996416688,
      "reward_std": 0.5661944150924683,
      "rewards/cosine_scaled_reward": -0.2187684327363968,
      "rewards/format_reward": 0.541666679084301,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1280.5000305175781,
      "epoch": 0.39885714285714285,
      "grad_norm": 5.965340713566614,
      "kl": 0.0919189453125,
      "learning_rate": 3.2772616003709616e-07,
      "loss": 0.264,
      "reward": 0.5343287643045187,
      "reward_std": 1.0619665831327438,
      "rewards/cosine_scaled_reward": -0.024502300075255334,
      "rewards/format_reward": 0.5833333432674408,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1420.2291870117188,
      "epoch": 0.4,
      "grad_norm": 2.925238124886515,
      "kl": 0.185791015625,
      "learning_rate": 3.250000000000001e-07,
      "loss": 0.1961,
      "reward": 0.12700789980590343,
      "reward_std": 0.8331074118614197,
      "rewards/cosine_scaled_reward": -0.1656627282500267,
      "rewards/format_reward": 0.4583333432674408,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1083.6042175292969,
      "epoch": 0.40114285714285713,
      "grad_norm": 3.606767246674259,
      "kl": 0.18115234375,
      "learning_rate": 3.222848061454764e-07,
      "loss": -0.0154,
      "reward": 0.25727599672973156,
      "reward_std": 0.6387183666229248,
      "rewards/cosine_scaled_reward": -0.18386201839894056,
      "rewards/format_reward": 0.6250000074505806,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1439.1250610351562,
      "epoch": 0.4022857142857143,
      "grad_norm": 1.929818758425276,
      "kl": 0.1397705078125,
      "learning_rate": 3.195807108082429e-07,
      "loss": 0.1728,
      "reward": -0.14825151395052671,
      "reward_std": 0.5558790042996407,
      "rewards/cosine_scaled_reward": -0.2824591100215912,
      "rewards/format_reward": 0.4166666716337204,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1523.3542175292969,
      "epoch": 0.4034285714285714,
      "grad_norm": 1.501137402879622,
      "kl": 0.15606689453125,
      "learning_rate": 3.168878457820915e-07,
      "loss": 0.1054,
      "reward": -0.2005203291773796,
      "reward_std": 0.5384240373969078,
      "rewards/cosine_scaled_reward": -0.2565101645886898,
      "rewards/format_reward": 0.31250000558793545,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1457.8125305175781,
      "epoch": 0.4045714285714286,
      "grad_norm": 1.7447813143906967,
      "kl": 0.19287109375,
      "learning_rate": 3.142063423134644e-07,
      "loss": 0.0946,
      "reward": -0.07205517496913671,
      "reward_std": 0.5912996232509613,
      "rewards/cosine_scaled_reward": -0.27561092376708984,
      "rewards/format_reward": 0.4791666865348816,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 935.1041793823242,
      "epoch": 0.4057142857142857,
      "grad_norm": 5.735907828017728,
      "kl": 0.416259765625,
      "learning_rate": 3.115363310950578e-07,
      "loss": 0.2126,
      "reward": 0.6018264503218234,
      "reward_std": 0.43670547753572464,
      "rewards/cosine_scaled_reward": -0.04283679276704788,
      "rewards/format_reward": 0.6875000149011612,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1400.4375610351562,
      "epoch": 0.40685714285714286,
      "grad_norm": 4.2513620245343855,
      "kl": 0.2110595703125,
      "learning_rate": 3.0887794225945143e-07,
      "loss": 0.0986,
      "reward": 0.07107849605381489,
      "reward_std": 0.6532387360930443,
      "rewards/cosine_scaled_reward": -0.22487742826342583,
      "rewards/format_reward": 0.5208333432674408,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1214.1250305175781,
      "epoch": 0.408,
      "grad_norm": 1.8135177203210504,
      "kl": 0.1314697265625,
      "learning_rate": 3.062313053727671e-07,
      "loss": 0.1426,
      "reward": 0.03724817745387554,
      "reward_std": 0.5181447230279446,
      "rewards/cosine_scaled_reward": -0.2730425810441375,
      "rewards/format_reward": 0.5833333507180214,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1284.7500305175781,
      "epoch": 0.40914285714285714,
      "grad_norm": 3.565695417018542,
      "kl": 0.18597412109375,
      "learning_rate": 3.0359654942835247e-07,
      "loss": 0.1245,
      "reward": 0.04130622744560242,
      "reward_std": 0.7205251231789589,
      "rewards/cosine_scaled_reward": -0.19809689931571484,
      "rewards/format_reward": 0.4375000111758709,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1343.2083740234375,
      "epoch": 0.4102857142857143,
      "grad_norm": 3.2057830256260917,
      "kl": 0.14324951171875,
      "learning_rate": 3.0097380284049523e-07,
      "loss": -0.0078,
      "reward": 0.1697351299226284,
      "reward_std": 0.3564612567424774,
      "rewards/cosine_scaled_reward": -0.13388244062662125,
      "rewards/format_reward": 0.4375000111758709,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1433.2291870117188,
      "epoch": 0.4114285714285714,
      "grad_norm": 1.6762255456245136,
      "kl": 0.1739501953125,
      "learning_rate": 2.9836319343816397e-07,
      "loss": 0.1781,
      "reward": 0.21988008171319962,
      "reward_std": 0.7903619408607483,
      "rewards/cosine_scaled_reward": -0.10880996193736792,
      "rewards/format_reward": 0.4375000223517418,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1309.6250457763672,
      "epoch": 0.4125714285714286,
      "grad_norm": 0.9826821036841882,
      "kl": 0.135498046875,
      "learning_rate": 2.9576484845877793e-07,
      "loss": 0.0186,
      "reward": 0.33486853912472725,
      "reward_std": 0.500580433756113,
      "rewards/cosine_scaled_reward": -0.11381572997197509,
      "rewards/format_reward": 0.5625000149011612,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1173.5625305175781,
      "epoch": 0.4137142857142857,
      "grad_norm": 4.226570835684713,
      "kl": 0.0926513671875,
      "learning_rate": 2.931788945420058e-07,
      "loss": 0.18,
      "reward": 0.15393588319420815,
      "reward_std": 0.5774414390325546,
      "rewards/cosine_scaled_reward": -0.20428206771612167,
      "rewards/format_reward": 0.5625000149011612,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1250.0416870117188,
      "epoch": 0.41485714285714287,
      "grad_norm": 1.978862188088671,
      "kl": 0.09033203125,
      "learning_rate": 2.9060545772359305e-07,
      "loss": 0.1327,
      "reward": 0.2741839215159416,
      "reward_std": 0.6551093906164169,
      "rewards/cosine_scaled_reward": -0.17540805786848068,
      "rewards/format_reward": 0.6250000149011612,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1354.9583892822266,
      "epoch": 0.416,
      "grad_norm": 1.500971160749094,
      "kl": 0.1431884765625,
      "learning_rate": 2.8804466342921987e-07,
      "loss": 0.1556,
      "reward": 0.09914333745837212,
      "reward_std": 0.5969183072447777,
      "rewards/cosine_scaled_reward": -0.17959501221776009,
      "rewards/format_reward": 0.4583333432674408,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1443.8958435058594,
      "epoch": 0.41714285714285715,
      "grad_norm": 1.5336203716893533,
      "kl": 0.14166259765625,
      "learning_rate": 2.854966364683872e-07,
      "loss": 0.0794,
      "reward": 0.08230920624919236,
      "reward_std": 0.7491874545812607,
      "rewards/cosine_scaled_reward": -0.18801206350326538,
      "rewards/format_reward": 0.4583333358168602,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1280.0833740234375,
      "epoch": 0.41828571428571426,
      "grad_norm": 5.917103922817008,
      "kl": 0.1395263671875,
      "learning_rate": 2.829615010283344e-07,
      "loss": 0.2201,
      "reward": 0.30844624526798725,
      "reward_std": 0.6032929718494415,
      "rewards/cosine_scaled_reward": -0.11661022901535034,
      "rewards/format_reward": 0.5416666865348816,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1071.7083587646484,
      "epoch": 0.41942857142857143,
      "grad_norm": 6.764653159306351,
      "kl": 1.21533203125,
      "learning_rate": 2.8043938066798645e-07,
      "loss": 0.2217,
      "reward": 0.5629880558699369,
      "reward_std": 0.7271402254700661,
      "rewards/cosine_scaled_reward": -0.02058931067585945,
      "rewards/format_reward": 0.6041666716337204,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1272.2708892822266,
      "epoch": 0.4205714285714286,
      "grad_norm": 3.868751461553908,
      "kl": 0.376953125,
      "learning_rate": 2.7793039831193133e-07,
      "loss": 0.0282,
      "reward": 0.2414314430207014,
      "reward_std": 0.783539354801178,
      "rewards/cosine_scaled_reward": -0.13970092684030533,
      "rewards/format_reward": 0.5208333544433117,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1453.8333740234375,
      "epoch": 0.4217142857142857,
      "grad_norm": 1.6191974125060598,
      "kl": 0.29150390625,
      "learning_rate": 2.7543467624442956e-07,
      "loss": 0.172,
      "reward": 0.11266430467367172,
      "reward_std": 0.7149153798818588,
      "rewards/cosine_scaled_reward": -0.1415845244191587,
      "rewards/format_reward": 0.39583334140479565,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1463.3959045410156,
      "epoch": 0.4228571428571429,
      "grad_norm": 4.101308083609096,
      "kl": 0.56884765625,
      "learning_rate": 2.729523361034538e-07,
      "loss": 0.2149,
      "reward": -0.2552230432629585,
      "reward_std": 0.5415500551462173,
      "rewards/cosine_scaled_reward": -0.26302820444107056,
      "rewards/format_reward": 0.27083334140479565,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1240.8542175292969,
      "epoch": 0.424,
      "grad_norm": 3.8927886605185447,
      "kl": 0.30340576171875,
      "learning_rate": 2.7048349887476037e-07,
      "loss": 0.1602,
      "reward": 0.1614240426570177,
      "reward_std": 0.5875495374202728,
      "rewards/cosine_scaled_reward": -0.15887131541967392,
      "rewards/format_reward": 0.479166679084301,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1177.3958740234375,
      "epoch": 0.42514285714285716,
      "grad_norm": 3.066569475752354,
      "kl": 0.1824951171875,
      "learning_rate": 2.6802828488599294e-07,
      "loss": 0.1059,
      "reward": 0.2956250160932541,
      "reward_std": 0.6594211757183075,
      "rewards/cosine_scaled_reward": -0.12302083522081375,
      "rewards/format_reward": 0.5416666939854622,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1163.4375610351562,
      "epoch": 0.42628571428571427,
      "grad_norm": 5.09566585578463,
      "kl": 0.2724609375,
      "learning_rate": 2.655868138008171e-07,
      "loss": 0.1544,
      "reward": 0.07318597589619458,
      "reward_std": 0.7096846550703049,
      "rewards/cosine_scaled_reward": -0.2759070098400116,
      "rewards/format_reward": 0.6250000149011612,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1246.7500457763672,
      "epoch": 0.42742857142857144,
      "grad_norm": 32.203352857308325,
      "kl": 0.839111328125,
      "learning_rate": 2.631592046130896e-07,
      "loss": 0.1927,
      "reward": 0.08969515189528465,
      "reward_std": 0.6610818058252335,
      "rewards/cosine_scaled_reward": -0.22598576080054045,
      "rewards/format_reward": 0.5416666865348816,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1280.7708740234375,
      "epoch": 0.42857142857142855,
      "grad_norm": 63.335567619096544,
      "kl": 0.94873046875,
      "learning_rate": 2.6074557564105724e-07,
      "loss": 0.2184,
      "reward": 0.18546735402196646,
      "reward_std": 0.9102050960063934,
      "rewards/cosine_scaled_reward": -0.17809965554624796,
      "rewards/format_reward": 0.5416667014360428,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1256.6250305175781,
      "epoch": 0.4297142857142857,
      "grad_norm": 3.6519960558396716,
      "kl": 0.310302734375,
      "learning_rate": 2.583460445215911e-07,
      "loss": 0.1114,
      "reward": 0.1940733604133129,
      "reward_std": 0.5819907337427139,
      "rewards/cosine_scaled_reward": -0.1946299858391285,
      "rewards/format_reward": 0.583333358168602,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1285.6458740234375,
      "epoch": 0.4308571428571429,
      "grad_norm": 5.319529708040252,
      "kl": 0.3394775390625,
      "learning_rate": 2.5596072820445254e-07,
      "loss": 0.0359,
      "reward": 0.25018906872719526,
      "reward_std": 0.8042758777737617,
      "rewards/cosine_scaled_reward": -0.13532213680446148,
      "rewards/format_reward": 0.5208333432674408,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1374.5416870117188,
      "epoch": 0.432,
      "grad_norm": 20.461016176615068,
      "kl": 0.70166015625,
      "learning_rate": 2.5358974294659373e-07,
      "loss": 0.2346,
      "reward": -0.005498896003700793,
      "reward_std": 0.5357099026441574,
      "rewards/cosine_scaled_reward": -0.22149945423007011,
      "rewards/format_reward": 0.4375000074505806,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1435.1875,
      "epoch": 0.43314285714285716,
      "grad_norm": 2.391831824846237,
      "kl": 0.292236328125,
      "learning_rate": 2.512332043064913e-07,
      "loss": 0.1982,
      "reward": 0.012932289391756058,
      "reward_std": 0.799980454146862,
      "rewards/cosine_scaled_reward": -0.20186719112098217,
      "rewards/format_reward": 0.4166666865348816,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1369.5833740234375,
      "epoch": 0.4342857142857143,
      "grad_norm": 2.2747857355280208,
      "kl": 0.1715087890625,
      "learning_rate": 2.488912271385139e-07,
      "loss": 0.1725,
      "reward": -0.22791396314278245,
      "reward_std": 0.4170580878853798,
      "rewards/cosine_scaled_reward": -0.3431236445903778,
      "rewards/format_reward": 0.4583333544433117,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1187.6458587646484,
      "epoch": 0.43542857142857144,
      "grad_norm": 2.7553958817382593,
      "kl": 0.16162109375,
      "learning_rate": 2.465639255873246e-07,
      "loss": 0.1247,
      "reward": 0.19117721682414412,
      "reward_std": 0.46048377081751823,
      "rewards/cosine_scaled_reward": -0.23774472624063492,
      "rewards/format_reward": 0.6666667014360428,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1300.3125305175781,
      "epoch": 0.43657142857142855,
      "grad_norm": 2.0362039263750082,
      "kl": 0.1822509765625,
      "learning_rate": 2.4425141308231765e-07,
      "loss": 0.1158,
      "reward": 0.2739548869431019,
      "reward_std": 0.603746622800827,
      "rewards/cosine_scaled_reward": -0.09218922536820173,
      "rewards/format_reward": 0.4583333432674408,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1263.7917175292969,
      "epoch": 0.4377142857142857,
      "grad_norm": 7.617696331239462,
      "kl": 0.2333984375,
      "learning_rate": 2.4195380233209006e-07,
      "loss": 0.1076,
      "reward": 0.12070683389902115,
      "reward_std": 0.38592402543872595,
      "rewards/cosine_scaled_reward": -0.18964658118784428,
      "rewards/format_reward": 0.5,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1170.3750457763672,
      "epoch": 0.43885714285714283,
      "grad_norm": 3.2601623912372233,
      "kl": 0.2103271484375,
      "learning_rate": 2.3967120531894857e-07,
      "loss": 0.1471,
      "reward": -0.021999074146151543,
      "reward_std": 0.34355130419135094,
      "rewards/cosine_scaled_reward": -0.31308288127183914,
      "rewards/format_reward": 0.6041666865348816,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1233.7083740234375,
      "epoch": 0.44,
      "grad_norm": 2.1916650637468025,
      "kl": 0.16259765625,
      "learning_rate": 2.374037332934512e-07,
      "loss": 0.0922,
      "reward": 0.054161038249731064,
      "reward_std": 0.7442760765552521,
      "rewards/cosine_scaled_reward": -0.2541694864630699,
      "rewards/format_reward": 0.5625000149011612,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 870.3333740234375,
      "epoch": 0.44114285714285717,
      "grad_norm": 1.4860604247340325,
      "kl": 0.0894775390625,
      "learning_rate": 2.3515149676898552e-07,
      "loss": 0.1158,
      "reward": 0.28954136464744806,
      "reward_std": 0.5479708462953568,
      "rewards/cosine_scaled_reward": -0.240646006539464,
      "rewards/format_reward": 0.770833358168602,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1227.2291870117188,
      "epoch": 0.4422857142857143,
      "grad_norm": 1.687755076974517,
      "kl": 0.2470703125,
      "learning_rate": 2.3291460551638237e-07,
      "loss": 0.151,
      "reward": -0.0012904666364192963,
      "reward_std": 0.4440325200557709,
      "rewards/cosine_scaled_reward": -0.2714785784482956,
      "rewards/format_reward": 0.5416666865348816,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1033.6042022705078,
      "epoch": 0.44342857142857145,
      "grad_norm": 2.5341444420596884,
      "kl": 0.164947509765625,
      "learning_rate": 2.306931685585657e-07,
      "loss": 0.0897,
      "reward": 0.4180222749710083,
      "reward_std": 0.754804901778698,
      "rewards/cosine_scaled_reward": -0.14515553694218397,
      "rewards/format_reward": 0.7083333432674408,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1356.8333740234375,
      "epoch": 0.44457142857142856,
      "grad_norm": 3.704344948231344,
      "kl": 0.372314453125,
      "learning_rate": 2.2848729416523859e-07,
      "loss": 0.102,
      "reward": 0.2806839719414711,
      "reward_std": 0.6125510483980179,
      "rewards/cosine_scaled_reward": -0.07840801030397415,
      "rewards/format_reward": 0.4375000074505806,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1090.2708740234375,
      "epoch": 0.44571428571428573,
      "grad_norm": 14.470921296685399,
      "kl": 0.47216796875,
      "learning_rate": 2.2629708984760706e-07,
      "loss": 0.2654,
      "reward": 0.07703178748488426,
      "reward_std": 0.5665107443928719,
      "rewards/cosine_scaled_reward": -0.26356743834912777,
      "rewards/format_reward": 0.6041666939854622,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1098.7500305175781,
      "epoch": 0.44685714285714284,
      "grad_norm": 2.4001916122615157,
      "kl": 0.26904296875,
      "learning_rate": 2.2412266235313973e-07,
      "loss": 0.1304,
      "reward": 0.2017030455172062,
      "reward_std": 0.5325312875211239,
      "rewards/cosine_scaled_reward": -0.20123182306997478,
      "rewards/format_reward": 0.6041666865348816,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1404.0625610351562,
      "epoch": 0.448,
      "grad_norm": 12.93850484473414,
      "kl": 0.662109375,
      "learning_rate": 2.2196411766036487e-07,
      "loss": 0.0663,
      "reward": 0.39279897045344114,
      "reward_std": 0.9181084930896759,
      "rewards/cosine_scaled_reward": -0.04318385384976864,
      "rewards/format_reward": 0.4791666939854622,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1046.0417175292969,
      "epoch": 0.4491428571428571,
      "grad_norm": 3.1910943036863695,
      "kl": 0.2236328125,
      "learning_rate": 2.1982156097370557e-07,
      "loss": 0.094,
      "reward": 0.1259294361807406,
      "reward_std": 0.620373547077179,
      "rewards/cosine_scaled_reward": -0.23911861330270767,
      "rewards/format_reward": 0.6041666865348816,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 898.3333511352539,
      "epoch": 0.4502857142857143,
      "grad_norm": 4.93057169428389,
      "kl": 0.25714111328125,
      "learning_rate": 2.1769509671835223e-07,
      "loss": 0.2665,
      "reward": 0.2223543766885996,
      "reward_std": 0.4368506968021393,
      "rewards/cosine_scaled_reward": -0.23257281631231308,
      "rewards/format_reward": 0.6875000149011612,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1140.0000457763672,
      "epoch": 0.4514285714285714,
      "grad_norm": 2.3738396662205945,
      "kl": 0.35986328125,
      "learning_rate": 2.1558482853517253e-07,
      "loss": 0.3105,
      "reward": 0.10918148793280125,
      "reward_std": 0.5202281884849072,
      "rewards/cosine_scaled_reward": -0.21624258160591125,
      "rewards/format_reward": 0.5416666865348816,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1151.3958740234375,
      "epoch": 0.45257142857142857,
      "grad_norm": 2.5367764499763257,
      "kl": 0.3154296875,
      "learning_rate": 2.134908592756607e-07,
      "loss": 0.1917,
      "reward": 0.17909681051969528,
      "reward_std": 0.7349686250090599,
      "rewards/cosine_scaled_reward": -0.2021182719618082,
      "rewards/format_reward": 0.5833333432674408,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1223.8958740234375,
      "epoch": 0.45371428571428574,
      "grad_norm": 3.0861426217577645,
      "kl": 0.38720703125,
      "learning_rate": 2.1141329099692406e-07,
      "loss": 0.2308,
      "reward": 0.6319128852337599,
      "reward_std": 0.8242618143558502,
      "rewards/cosine_scaled_reward": 0.04512310400605202,
      "rewards/format_reward": 0.541666679084301,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1218.8541870117188,
      "epoch": 0.45485714285714285,
      "grad_norm": 18.365837770437405,
      "kl": 0.6829833984375,
      "learning_rate": 2.0935222495670968e-07,
      "loss": 0.189,
      "reward": 0.27588833356276155,
      "reward_std": 0.8127910792827606,
      "rewards/cosine_scaled_reward": -0.19538918882608414,
      "rewards/format_reward": 0.6666666939854622,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1264.1666870117188,
      "epoch": 0.456,
      "grad_norm": 3.8049582826738373,
      "kl": 0.47314453125,
      "learning_rate": 2.0730776160846853e-07,
      "loss": 0.1823,
      "reward": 0.055698491632938385,
      "reward_std": 0.49411067366600037,
      "rewards/cosine_scaled_reward": -0.21173409838229418,
      "rewards/format_reward": 0.4791666828095913,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1013.3750457763672,
      "epoch": 0.45714285714285713,
      "grad_norm": 7.251771375036044,
      "kl": 0.4078369140625,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.175,
      "reward": 0.2562308683991432,
      "reward_std": 0.2563706263899803,
      "rewards/cosine_scaled_reward": -0.2260512337088585,
      "rewards/format_reward": 0.7083333488553762,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 952.7291870117188,
      "epoch": 0.4582857142857143,
      "grad_norm": 8.82258461767532,
      "kl": 0.45355224609375,
      "learning_rate": 2.032690407508949e-07,
      "loss": 0.1529,
      "reward": 0.4902263447875157,
      "reward_std": 0.5446355119347572,
      "rewards/cosine_scaled_reward": -0.11947017908096313,
      "rewards/format_reward": 0.729166679084301,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1302.8542175292969,
      "epoch": 0.4594285714285714,
      "grad_norm": 9.144934630730456,
      "kl": 0.51953125,
      "learning_rate": 2.0127498008311922e-07,
      "loss": 0.1489,
      "reward": 0.0001004636287689209,
      "reward_std": 0.5631029531359673,
      "rewards/cosine_scaled_reward": -0.28119976818561554,
      "rewards/format_reward": 0.5625000223517418,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1430.5833740234375,
      "epoch": 0.4605714285714286,
      "grad_norm": 1.9477748820622875,
      "kl": 0.3515625,
      "learning_rate": 1.9929791578083655e-07,
      "loss": 0.2408,
      "reward": -0.06413780152797699,
      "reward_std": 0.7934899777173996,
      "rewards/cosine_scaled_reward": -0.2195689007639885,
      "rewards/format_reward": 0.3750000037252903,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1262.8333587646484,
      "epoch": 0.4617142857142857,
      "grad_norm": 3.330875199108497,
      "kl": 0.19140625,
      "learning_rate": 1.9733794420337213e-07,
      "loss": 0.1344,
      "reward": 0.1329102972522378,
      "reward_std": 0.5511343032121658,
      "rewards/cosine_scaled_reward": -0.25646152906119823,
      "rewards/format_reward": 0.645833358168602,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1030.2916870117188,
      "epoch": 0.46285714285714286,
      "grad_norm": 7.2006501333137996,
      "kl": 0.147216796875,
      "learning_rate": 1.9539516087697517e-07,
      "loss": 0.1614,
      "reward": 0.41257511638104916,
      "reward_std": 0.4603617787361145,
      "rewards/cosine_scaled_reward": -0.14787913113832474,
      "rewards/format_reward": 0.7083333432674408,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1085.1666870117188,
      "epoch": 0.464,
      "grad_norm": 1.7990135612572722,
      "kl": 0.25146484375,
      "learning_rate": 1.934696604901642e-07,
      "loss": 0.1199,
      "reward": -0.0262349434196949,
      "reward_std": 0.4924147129058838,
      "rewards/cosine_scaled_reward": -0.2839508093893528,
      "rewards/format_reward": 0.5416666828095913,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 953.4375,
      "epoch": 0.46514285714285714,
      "grad_norm": 2.205384781012483,
      "kl": 0.16357421875,
      "learning_rate": 1.915615368891117e-07,
      "loss": 0.0901,
      "reward": 0.5169772207736969,
      "reward_std": 0.28926569409668446,
      "rewards/cosine_scaled_reward": -0.0748447310179472,
      "rewards/format_reward": 0.6666666828095913,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 914.9583587646484,
      "epoch": 0.4662857142857143,
      "grad_norm": 2.497956913876472,
      "kl": 0.27496337890625,
      "learning_rate": 1.8967088307307e-07,
      "loss": 0.1155,
      "reward": 0.3262156348209828,
      "reward_std": 0.6255160942673683,
      "rewards/cosine_scaled_reward": -0.13897553086280823,
      "rewards/format_reward": 0.6041666716337204,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1040.8333435058594,
      "epoch": 0.4674285714285714,
      "grad_norm": 8.69885706641019,
      "kl": 0.2950439453125,
      "learning_rate": 1.8779779118983867e-07,
      "loss": 0.1446,
      "reward": 0.45548180863261223,
      "reward_std": 0.683892697095871,
      "rewards/cosine_scaled_reward": -0.1472591133788228,
      "rewards/format_reward": 0.7500000149011612,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1113.8750305175781,
      "epoch": 0.4685714285714286,
      "grad_norm": 2.827095364325863,
      "kl": 0.17364501953125,
      "learning_rate": 1.8594235253127372e-07,
      "loss": 0.1365,
      "reward": -0.055647075176239014,
      "reward_std": 0.5701718181371689,
      "rewards/cosine_scaled_reward": -0.3299068883061409,
      "rewards/format_reward": 0.6041666939854622,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1180.7500305175781,
      "epoch": 0.4697142857142857,
      "grad_norm": 6.312691045251246,
      "kl": 0.2171630859375,
      "learning_rate": 1.8410465752883758e-07,
      "loss": 0.26,
      "reward": -0.027378916274756193,
      "reward_std": 0.5135050415992737,
      "rewards/cosine_scaled_reward": -0.33660613000392914,
      "rewards/format_reward": 0.6458333432674408,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 936.0625152587891,
      "epoch": 0.47085714285714286,
      "grad_norm": 8.466457070247934,
      "kl": 0.207763671875,
      "learning_rate": 1.822847957491922e-07,
      "loss": 0.2152,
      "reward": 0.2903781367931515,
      "reward_std": 0.6151079386472702,
      "rewards/cosine_scaled_reward": -0.24022759683430195,
      "rewards/format_reward": 0.770833358168602,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1127.9375610351562,
      "epoch": 0.472,
      "grad_norm": 7.005816452720984,
      "kl": 0.23388671875,
      "learning_rate": 1.804828558898332e-07,
      "loss": 0.2359,
      "reward": -0.05256163072772324,
      "reward_std": 0.5086416229605675,
      "rewards/cosine_scaled_reward": -0.30753082782030106,
      "rewards/format_reward": 0.5625000223517418,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1254.2292175292969,
      "epoch": 0.47314285714285714,
      "grad_norm": 3.1930529627345146,
      "kl": 0.30908203125,
      "learning_rate": 1.7869892577476722e-07,
      "loss": 0.091,
      "reward": 0.27630291134119034,
      "reward_std": 0.601336345076561,
      "rewards/cosine_scaled_reward": -0.12226520664989948,
      "rewards/format_reward": 0.520833358168602,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1198.7083740234375,
      "epoch": 0.4742857142857143,
      "grad_norm": 1.9203274121236615,
      "kl": 0.27783203125,
      "learning_rate": 1.7693309235023127e-07,
      "loss": 0.1839,
      "reward": 0.15045135095715523,
      "reward_std": 0.8359555453062057,
      "rewards/cosine_scaled_reward": -0.21644099615514278,
      "rewards/format_reward": 0.5833333507180214,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1303.2708435058594,
      "epoch": 0.4754285714285714,
      "grad_norm": 5.219130595783076,
      "kl": 0.288330078125,
      "learning_rate": 1.7518544168045524e-07,
      "loss": 0.2384,
      "reward": 0.06198018416762352,
      "reward_std": 0.7209452688694,
      "rewards/cosine_scaled_reward": -0.2502599246799946,
      "rewards/format_reward": 0.5625000149011612,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1104.3958740234375,
      "epoch": 0.4765714285714286,
      "grad_norm": 343.4311543801194,
      "kl": 3.455078125,
      "learning_rate": 1.7345605894346726e-07,
      "loss": 0.3667,
      "reward": 0.25671100057661533,
      "reward_std": 0.5841851308941841,
      "rewards/cosine_scaled_reward": -0.19456118065863848,
      "rewards/format_reward": 0.645833358168602,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1071.9375305175781,
      "epoch": 0.4777142857142857,
      "grad_norm": 3.5739561302927703,
      "kl": 0.18438720703125,
      "learning_rate": 1.7174502842694212e-07,
      "loss": 0.0318,
      "reward": 0.18263494968414307,
      "reward_std": 0.688008576631546,
      "rewards/cosine_scaled_reward": -0.25243253633379936,
      "rewards/format_reward": 0.6875000149011612,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1059.2500457763672,
      "epoch": 0.47885714285714287,
      "grad_norm": 42.82614000306872,
      "kl": 14.88720703125,
      "learning_rate": 1.7005243352409333e-07,
      "loss": 0.182,
      "reward": 0.10820261249318719,
      "reward_std": 0.658612459897995,
      "rewards/cosine_scaled_reward": -0.24798204004764557,
      "rewards/format_reward": 0.6041666716337204,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1066.7917022705078,
      "epoch": 0.48,
      "grad_norm": 7.563689623131912,
      "kl": 0.54296875,
      "learning_rate": 1.6837835672960831e-07,
      "loss": 0.1366,
      "reward": 0.24830662203021348,
      "reward_std": 0.6641267538070679,
      "rewards/cosine_scaled_reward": -0.19876337423920631,
      "rewards/format_reward": 0.6458333432674408,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1140.8958740234375,
      "epoch": 0.48114285714285715,
      "grad_norm": 5.102712434876203,
      "kl": 0.455322265625,
      "learning_rate": 1.6672287963562852e-07,
      "loss": 0.238,
      "reward": 0.22175164567306638,
      "reward_std": 0.48806294053792953,
      "rewards/cosine_scaled_reward": -0.19120752811431885,
      "rewards/format_reward": 0.6041666865348816,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1181.3333587646484,
      "epoch": 0.48228571428571426,
      "grad_norm": 11.187728016893017,
      "kl": 0.7470703125,
      "learning_rate": 1.6508608292777203e-07,
      "loss": 0.2428,
      "reward": 0.016264647245407104,
      "reward_std": 0.7520715892314911,
      "rewards/cosine_scaled_reward": -0.27311767637729645,
      "rewards/format_reward": 0.5625000149011612,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1151.5625305175781,
      "epoch": 0.48342857142857143,
      "grad_norm": 36.484656907353894,
      "kl": 1.12109375,
      "learning_rate": 1.6346804638120098e-07,
      "loss": 0.225,
      "reward": 0.166658578440547,
      "reward_std": 0.5137820392847061,
      "rewards/cosine_scaled_reward": -0.20833738893270493,
      "rewards/format_reward": 0.5833333432674408,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1222.4167022705078,
      "epoch": 0.4845714285714286,
      "grad_norm": 5.314021913144739,
      "kl": 0.468994140625,
      "learning_rate": 1.6186884885673413e-07,
      "loss": 0.0791,
      "reward": -0.053052062867209315,
      "reward_std": 0.5032695159316063,
      "rewards/cosine_scaled_reward": -0.349442720413208,
      "rewards/format_reward": 0.6458333507180214,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1303.0417022705078,
      "epoch": 0.4857142857142857,
      "grad_norm": 15.439357915372184,
      "kl": 0.76171875,
      "learning_rate": 1.6028856829700258e-07,
      "loss": 0.1567,
      "reward": 0.06288054899778217,
      "reward_std": 0.8221424967050552,
      "rewards/cosine_scaled_reward": -0.24980972707271576,
      "rewards/format_reward": 0.5625000111758709,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1363.2292175292969,
      "epoch": 0.4868571428571429,
      "grad_norm": 16.190560753791,
      "kl": 0.64306640625,
      "learning_rate": 1.5872728172265146e-07,
      "loss": 0.2057,
      "reward": 0.0070614293217659,
      "reward_std": 0.8801029026508331,
      "rewards/cosine_scaled_reward": -0.18396929651498795,
      "rewards/format_reward": 0.3750000111758709,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1062.6250457763672,
      "epoch": 0.488,
      "grad_norm": 5.208018104302035,
      "kl": 0.289306640625,
      "learning_rate": 1.5718506522858572e-07,
      "loss": 0.2392,
      "reward": 0.1040960568934679,
      "reward_std": 0.7021225243806839,
      "rewards/cosine_scaled_reward": -0.21878531202673912,
      "rewards/format_reward": 0.5416666865348816,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1168.6250457763672,
      "epoch": 0.48914285714285716,
      "grad_norm": 1.7936384513629215,
      "kl": 0.194366455078125,
      "learning_rate": 1.5566199398026147e-07,
      "loss": 0.1231,
      "reward": 0.1094297245144844,
      "reward_std": 0.5426923930644989,
      "rewards/cosine_scaled_reward": -0.247368473559618,
      "rewards/format_reward": 0.6041666716337204,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1089.6875305175781,
      "epoch": 0.49028571428571427,
      "grad_norm": 3.242866515089598,
      "kl": 0.18408203125,
      "learning_rate": 1.5415814221002265e-07,
      "loss": 0.1081,
      "reward": 0.4839252680540085,
      "reward_std": 0.5947171896696091,
      "rewards/cosine_scaled_reward": -0.03928736597299576,
      "rewards/format_reward": 0.5625000260770321,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1313.5833587646484,
      "epoch": 0.49142857142857144,
      "grad_norm": 1.478054069262014,
      "kl": 0.21612548828125,
      "learning_rate": 1.5267358321348285e-07,
      "loss": 0.1273,
      "reward": 0.15572084113955498,
      "reward_std": 0.5618212074041367,
      "rewards/cosine_scaled_reward": -0.18255625164601952,
      "rewards/format_reward": 0.5208333358168602,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 975.4791870117188,
      "epoch": 0.49257142857142855,
      "grad_norm": 3.541465585724065,
      "kl": 0.1605224609375,
      "learning_rate": 1.5120838934595337e-07,
      "loss": 0.1148,
      "reward": 0.420807933434844,
      "reward_std": 0.890654593706131,
      "rewards/cosine_scaled_reward": -0.11251270584762096,
      "rewards/format_reward": 0.6458333507180214,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1181.062515258789,
      "epoch": 0.4937142857142857,
      "grad_norm": 3.350973639300781,
      "kl": 0.155517578125,
      "learning_rate": 1.4976263201891613e-07,
      "loss": 0.1027,
      "reward": 0.032605723943561316,
      "reward_std": 0.5731803774833679,
      "rewards/cosine_scaled_reward": -0.2753637991845608,
      "rewards/format_reward": 0.5833333507180214,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1207.0417175292969,
      "epoch": 0.4948571428571429,
      "grad_norm": 4.990349151202906,
      "kl": 0.185546875,
      "learning_rate": 1.483363816965435e-07,
      "loss": 0.1393,
      "reward": 0.08886189805343747,
      "reward_std": 0.4594448246061802,
      "rewards/cosine_scaled_reward": -0.23681906727142632,
      "rewards/format_reward": 0.5625000298023224,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 859.1250457763672,
      "epoch": 0.496,
      "grad_norm": 1.9877951359345267,
      "kl": 0.17950439453125,
      "learning_rate": 1.469297078922642e-07,
      "loss": 0.0512,
      "reward": 1.2721150815486908,
      "reward_std": 0.6770742386579514,
      "rewards/cosine_scaled_reward": 0.20897419564425945,
      "rewards/format_reward": 0.8541666716337204,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1266.375015258789,
      "epoch": 0.49714285714285716,
      "grad_norm": 1.8972601097369153,
      "kl": 0.2255859375,
      "learning_rate": 1.4554267916537495e-07,
      "loss": 0.1234,
      "reward": 0.10697830189019442,
      "reward_std": 0.531020175665617,
      "rewards/cosine_scaled_reward": -0.22776086255908012,
      "rewards/format_reward": 0.5625000149011612,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1315.5625305175781,
      "epoch": 0.4982857142857143,
      "grad_norm": 2.490164316553904,
      "kl": 0.2635498046875,
      "learning_rate": 1.4417536311769885e-07,
      "loss": 0.1196,
      "reward": -0.10972822457551956,
      "reward_std": 0.5596715956926346,
      "rewards/cosine_scaled_reward": -0.2840307876467705,
      "rewards/format_reward": 0.4583333395421505,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1060.5208587646484,
      "epoch": 0.49942857142857144,
      "grad_norm": 1.9387158266765225,
      "kl": 0.294189453125,
      "learning_rate": 1.4282782639029128e-07,
      "loss": 0.0174,
      "reward": 0.52107123285532,
      "reward_std": 0.5726887807250023,
      "rewards/cosine_scaled_reward": -0.05196441989392042,
      "rewards/format_reward": 0.6250000298023224,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 921.3125457763672,
      "epoch": 0.5005714285714286,
      "grad_norm": 8.654811309244227,
      "kl": 0.2535400390625,
      "learning_rate": 1.4150013466019114e-07,
      "loss": 0.1354,
      "reward": 0.20009983237832785,
      "reward_std": 0.6868909299373627,
      "rewards/cosine_scaled_reward": -0.20203341665910557,
      "rewards/format_reward": 0.6041666865348816,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1083.0833740234375,
      "epoch": 0.5017142857142857,
      "grad_norm": 5.3889889905872375,
      "kl": 0.40216064453125,
      "learning_rate": 1.4019235263722034e-07,
      "loss": 0.2461,
      "reward": 0.11843711510300636,
      "reward_std": 0.5985070914030075,
      "rewards/cosine_scaled_reward": -0.2636981066316366,
      "rewards/format_reward": 0.6458333432674408,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1088.8750305175781,
      "epoch": 0.5028571428571429,
      "grad_norm": 4.149099334589977,
      "kl": 0.30615234375,
      "learning_rate": 1.3890454406082956e-07,
      "loss": 0.033,
      "reward": 0.11394692957401276,
      "reward_std": 0.6579174622893333,
      "rewards/cosine_scaled_reward": -0.24510987009853125,
      "rewards/format_reward": 0.6041666865348816,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 984.3125610351562,
      "epoch": 0.504,
      "grad_norm": 16.782102815445953,
      "kl": 0.367919921875,
      "learning_rate": 1.3763677169699217e-07,
      "loss": 0.0977,
      "reward": 0.3985663428902626,
      "reward_std": 0.42315196245908737,
      "rewards/cosine_scaled_reward": -0.134050190448761,
      "rewards/format_reward": 0.6666666828095913,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 988.4583587646484,
      "epoch": 0.5051428571428571,
      "grad_norm": 4.106390753028162,
      "kl": 0.31378173828125,
      "learning_rate": 1.3638909733514452e-07,
      "loss": 0.0056,
      "reward": 0.13055693171918392,
      "reward_std": 0.48535653203725815,
      "rewards/cosine_scaled_reward": -0.27847154438495636,
      "rewards/format_reward": 0.6875000223517418,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1004.3750152587891,
      "epoch": 0.5062857142857143,
      "grad_norm": 8.08171445493757,
      "kl": 0.16259765625,
      "learning_rate": 1.351615817851748e-07,
      "loss": 0.2301,
      "reward": 0.30977149307727814,
      "reward_std": 0.6895428746938705,
      "rewards/cosine_scaled_reward": -0.18886426091194153,
      "rewards/format_reward": 0.6875000298023224,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1087.9375305175781,
      "epoch": 0.5074285714285715,
      "grad_norm": 3.849891000062917,
      "kl": 0.1669921875,
      "learning_rate": 1.3395428487445914e-07,
      "loss": 0.0975,
      "reward": 0.4580417312681675,
      "reward_std": 0.640699241310358,
      "rewards/cosine_scaled_reward": -0.14597914181649685,
      "rewards/format_reward": 0.7500000298023224,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1224.2500457763672,
      "epoch": 0.5085714285714286,
      "grad_norm": 23.59411548899569,
      "kl": 0.82958984375,
      "learning_rate": 1.3276726544494571e-07,
      "loss": 0.2131,
      "reward": 0.051861570216715336,
      "reward_std": 0.6278680041432381,
      "rewards/cosine_scaled_reward": -0.2553192190825939,
      "rewards/format_reward": 0.5625,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1067.9791870117188,
      "epoch": 0.5097142857142857,
      "grad_norm": 3.656936199785778,
      "kl": 0.24365234375,
      "learning_rate": 1.316005813502869e-07,
      "loss": 0.0234,
      "reward": 0.36385649256408215,
      "reward_std": 0.7834623008966446,
      "rewards/cosine_scaled_reward": -0.1722384188324213,
      "rewards/format_reward": 0.7083333432674408,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1174.7083740234375,
      "epoch": 0.5108571428571429,
      "grad_norm": 2.1759216078948036,
      "kl": 0.3828125,
      "learning_rate": 1.3045428945301953e-07,
      "loss": 0.2194,
      "reward": 0.23982627410441637,
      "reward_std": 0.5332969650626183,
      "rewards/cosine_scaled_reward": -0.21342020854353905,
      "rewards/format_reward": 0.6666666865348816,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1180.2083740234375,
      "epoch": 0.512,
      "grad_norm": 4.2366039265569135,
      "kl": 0.31494140625,
      "learning_rate": 1.2932844562179352e-07,
      "loss": 0.1963,
      "reward": 0.3762773834168911,
      "reward_std": 0.6801744475960732,
      "rewards/cosine_scaled_reward": -0.14519466273486614,
      "rewards/format_reward": 0.666666679084301,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 994.3958740234375,
      "epoch": 0.5131428571428571,
      "grad_norm": 4.146583173336839,
      "kl": 0.148681640625,
      "learning_rate": 1.2822310472864885e-07,
      "loss": 0.1922,
      "reward": 0.36078188568353653,
      "reward_std": 0.737194113433361,
      "rewards/cosine_scaled_reward": -0.15294241392984986,
      "rewards/format_reward": 0.6666667014360428,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 851.8958587646484,
      "epoch": 0.5142857142857142,
      "grad_norm": 51.8228987068238,
      "kl": 0.534942626953125,
      "learning_rate": 1.2713832064634125e-07,
      "loss": 0.1269,
      "reward": 0.5865043960511684,
      "reward_std": 0.4706997238099575,
      "rewards/cosine_scaled_reward": -0.10258114710450172,
      "rewards/format_reward": 0.7916666865348816,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1079.7916870117188,
      "epoch": 0.5154285714285715,
      "grad_norm": 6.392599999015184,
      "kl": 0.2735595703125,
      "learning_rate": 1.260741462457165e-07,
      "loss": 0.2626,
      "reward": 0.22280075028538704,
      "reward_std": 0.6088056340813637,
      "rewards/cosine_scaled_reward": -0.18026629835367203,
      "rewards/format_reward": 0.5833333395421505,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1046.8542022705078,
      "epoch": 0.5165714285714286,
      "grad_norm": 8.715599338320725,
      "kl": 0.152099609375,
      "learning_rate": 1.2503063339313356e-07,
      "loss": 0.2189,
      "reward": -0.0023173224180936813,
      "reward_std": 0.5100973732769489,
      "rewards/cosine_scaled_reward": -0.3136586770415306,
      "rewards/format_reward": 0.6250000298023224,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1271.9375305175781,
      "epoch": 0.5177142857142857,
      "grad_norm": 2.4908553038859917,
      "kl": 0.40869140625,
      "learning_rate": 1.2400783294793668e-07,
      "loss": 0.1805,
      "reward": 0.027155719697475433,
      "reward_std": 0.5863115191459656,
      "rewards/cosine_scaled_reward": -0.23642215505242348,
      "rewards/format_reward": 0.5000000149011612,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1104.6667175292969,
      "epoch": 0.5188571428571429,
      "grad_norm": 42.815024473876115,
      "kl": 2.04296875,
      "learning_rate": 1.2300579475997657e-07,
      "loss": 0.039,
      "reward": 0.2878073714673519,
      "reward_std": 0.6589629650115967,
      "rewards/cosine_scaled_reward": -0.13734631799161434,
      "rewards/format_reward": 0.5625000223517418,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1183.250015258789,
      "epoch": 0.52,
      "grad_norm": 2.6108705721314824,
      "kl": 0.306640625,
      "learning_rate": 1.220245676671809e-07,
      "loss": 0.1199,
      "reward": 0.3790533752180636,
      "reward_std": 0.4862861856818199,
      "rewards/cosine_scaled_reward": -0.13338997215032578,
      "rewards/format_reward": 0.645833358168602,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1168.4375305175781,
      "epoch": 0.5211428571428571,
      "grad_norm": 25.701940158931713,
      "kl": 0.477294921875,
      "learning_rate": 1.2106419949317388e-07,
      "loss": 0.1681,
      "reward": 0.2994745699688792,
      "reward_std": 0.7066301554441452,
      "rewards/cosine_scaled_reward": -0.15234605269506574,
      "rewards/format_reward": 0.6041666865348816,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1007.4167022705078,
      "epoch": 0.5222857142857142,
      "grad_norm": 8.570796851314613,
      "kl": 0.186279296875,
      "learning_rate": 1.2012473704494537e-07,
      "loss": 0.3132,
      "reward": 0.400404367595911,
      "reward_std": 0.5747000873088837,
      "rewards/cosine_scaled_reward": -0.15396450087428093,
      "rewards/format_reward": 0.708333358168602,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1272.8958435058594,
      "epoch": 0.5234285714285715,
      "grad_norm": 4.345232068890798,
      "kl": 0.48974609375,
      "learning_rate": 1.1920622611056974e-07,
      "loss": 0.2715,
      "reward": 0.2525772713124752,
      "reward_std": 0.8047986179590225,
      "rewards/cosine_scaled_reward": -0.12371136248111725,
      "rewards/format_reward": 0.5000000298023224,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1063.625015258789,
      "epoch": 0.5245714285714286,
      "grad_norm": 5.1821320020363615,
      "kl": 0.15771484375,
      "learning_rate": 1.1830871145697412e-07,
      "loss": 0.1275,
      "reward": 0.02093285135924816,
      "reward_std": 0.42146630585193634,
      "rewards/cosine_scaled_reward": -0.3124502506107092,
      "rewards/format_reward": 0.645833358168602,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1044.2708435058594,
      "epoch": 0.5257142857142857,
      "grad_norm": 3.3010395921201514,
      "kl": 0.267333984375,
      "learning_rate": 1.1743223682775649e-07,
      "loss": 0.1046,
      "reward": 0.2678923445455439,
      "reward_std": 0.896328404545784,
      "rewards/cosine_scaled_reward": -0.12647049874067307,
      "rewards/format_reward": 0.5208333656191826,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1053.4583587646484,
      "epoch": 0.5268571428571428,
      "grad_norm": 1.478075619341315,
      "kl": 0.21600341796875,
      "learning_rate": 1.1657684494105386e-07,
      "loss": 0.0555,
      "reward": 0.3391416594386101,
      "reward_std": 0.9088789522647858,
      "rewards/cosine_scaled_reward": -0.20542917400598526,
      "rewards/format_reward": 0.7500000223517418,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1103.2083740234375,
      "epoch": 0.528,
      "grad_norm": 9530.342121672113,
      "kl": 28.46978759765625,
      "learning_rate": 1.1574257748745986e-07,
      "loss": 1.3293,
      "reward": 0.14297988126054406,
      "reward_std": 0.5064843520522118,
      "rewards/cosine_scaled_reward": -0.25142673472873867,
      "rewards/format_reward": 0.645833358168602,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1043.3333587646484,
      "epoch": 0.5291428571428571,
      "grad_norm": 4.739243744092973,
      "kl": 0.39892578125,
      "learning_rate": 1.1492947512799328e-07,
      "loss": 0.2493,
      "reward": 0.6755956448614597,
      "reward_std": 0.4871959462761879,
      "rewards/cosine_scaled_reward": 0.025297801941633224,
      "rewards/format_reward": 0.6250000298023224,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 992.2916717529297,
      "epoch": 0.5302857142857142,
      "grad_norm": 122.21833055898026,
      "kl": 1.33843994140625,
      "learning_rate": 1.1413757749211602e-07,
      "loss": 0.2572,
      "reward": 0.29958341596648097,
      "reward_std": 0.8296171501278877,
      "rewards/cosine_scaled_reward": -0.20437496528029442,
      "rewards/format_reward": 0.7083333432674408,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1240.5417175292969,
      "epoch": 0.5314285714285715,
      "grad_norm": 6.845067293294205,
      "kl": 0.55908203125,
      "learning_rate": 1.1336692317580158e-07,
      "loss": 0.1986,
      "reward": 0.072305912617594,
      "reward_std": 0.4831778481602669,
      "rewards/cosine_scaled_reward": -0.2138470560312271,
      "rewards/format_reward": 0.5000000186264515,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1192.9167175292969,
      "epoch": 0.5325714285714286,
      "grad_norm": 9.93163371597492,
      "kl": 0.49163818359375,
      "learning_rate": 1.1261754973965422e-07,
      "loss": 0.0928,
      "reward": 0.04001780319958925,
      "reward_std": 0.44342009350657463,
      "rewards/cosine_scaled_reward": -0.28207441698759794,
      "rewards/format_reward": 0.6041666865348816,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1341.1667175292969,
      "epoch": 0.5337142857142857,
      "grad_norm": 19.835786495839272,
      "kl": 0.8251953125,
      "learning_rate": 1.1188949370707787e-07,
      "loss": 0.2635,
      "reward": 0.1306269969791174,
      "reward_std": 0.6591696962714195,
      "rewards/cosine_scaled_reward": -0.21593650616705418,
      "rewards/format_reward": 0.5625000298023224,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1138.7708740234375,
      "epoch": 0.5348571428571428,
      "grad_norm": 13.935934940776233,
      "kl": 0.576904296875,
      "learning_rate": 1.1118279056249653e-07,
      "loss": 0.0976,
      "reward": 0.37931894324719906,
      "reward_std": 0.5462356135249138,
      "rewards/cosine_scaled_reward": -0.10200719349086285,
      "rewards/format_reward": 0.5833333432674408,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1028.6041870117188,
      "epoch": 0.536,
      "grad_norm": 2.6967193006473216,
      "kl": 0.26171875,
      "learning_rate": 1.1049747474962444e-07,
      "loss": 0.0529,
      "reward": 0.40807172656059265,
      "reward_std": 0.6494475156068802,
      "rewards/cosine_scaled_reward": -0.13971414044499397,
      "rewards/format_reward": 0.6875000149011612,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 848.5833587646484,
      "epoch": 0.5371428571428571,
      "grad_norm": 1.7855628531087904,
      "kl": 0.1328125,
      "learning_rate": 1.0983357966978745e-07,
      "loss": 0.0217,
      "reward": 0.6918718162924051,
      "reward_std": 0.5211210399866104,
      "rewards/cosine_scaled_reward": -0.0811474658548832,
      "rewards/format_reward": 0.8541666865348816,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1070.7500457763672,
      "epoch": 0.5382857142857143,
      "grad_norm": 6.038242137859596,
      "kl": 0.169921875,
      "learning_rate": 1.0919113768029517e-07,
      "loss": 0.2149,
      "reward": 0.06172482669353485,
      "reward_std": 0.5211478099226952,
      "rewards/cosine_scaled_reward": -0.29205426201224327,
      "rewards/format_reward": 0.6458333432674408,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 842.7083435058594,
      "epoch": 0.5394285714285715,
      "grad_norm": 12.109988243714355,
      "kl": 0.2894287109375,
      "learning_rate": 1.0857018009286381e-07,
      "loss": -0.0496,
      "reward": 0.46792223304510117,
      "reward_std": 0.54752978682518,
      "rewards/cosine_scaled_reward": -0.17228887975215912,
      "rewards/format_reward": 0.8125000149011612,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1181.1875305175781,
      "epoch": 0.5405714285714286,
      "grad_norm": 6.409193674543087,
      "kl": 0.3604736328125,
      "learning_rate": 1.0797073717209013e-07,
      "loss": 0.0613,
      "reward": -0.049041745252907276,
      "reward_std": 0.5112807080149651,
      "rewards/cosine_scaled_reward": -0.2641042061150074,
      "rewards/format_reward": 0.4791666865348816,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 990.3750305175781,
      "epoch": 0.5417142857142857,
      "grad_norm": 9.264374683069418,
      "kl": 0.11328125,
      "learning_rate": 1.0739283813397639e-07,
      "loss": 0.1628,
      "reward": 0.32282854616642,
      "reward_std": 0.7814144194126129,
      "rewards/cosine_scaled_reward": -0.20316907577216625,
      "rewards/format_reward": 0.7291667014360428,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 962.4792022705078,
      "epoch": 0.5428571428571428,
      "grad_norm": 4.242696196218054,
      "kl": 0.12603759765625,
      "learning_rate": 1.068365111445064e-07,
      "loss": 0.1483,
      "reward": 0.08424473810009658,
      "reward_std": 0.48827143758535385,
      "rewards/cosine_scaled_reward": -0.28079431876540184,
      "rewards/format_reward": 0.6458333544433117,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1048.8958587646484,
      "epoch": 0.544,
      "grad_norm": 18.108937894089827,
      "kl": 0.4520263671875,
      "learning_rate": 1.063017833182728e-07,
      "loss": 0.1426,
      "reward": 0.3813807927072048,
      "reward_std": 0.6394810080528259,
      "rewards/cosine_scaled_reward": -0.05930961295962334,
      "rewards/format_reward": 0.5000000223517418,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 723.4791870117188,
      "epoch": 0.5451428571428572,
      "grad_norm": 5.68071346914076,
      "kl": 0.156005859375,
      "learning_rate": 1.0578868071715544e-07,
      "loss": 0.0836,
      "reward": 0.7284884303808212,
      "reward_std": 0.6032212525606155,
      "rewards/cosine_scaled_reward": -0.08367248624563217,
      "rewards/format_reward": 0.8958333432674408,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1013.3750305175781,
      "epoch": 0.5462857142857143,
      "grad_norm": 4.283725447191352,
      "kl": 0.0955657958984375,
      "learning_rate": 1.0529722834905125e-07,
      "loss": 0.1397,
      "reward": 0.49316432885825634,
      "reward_std": 0.45135799795389175,
      "rewards/cosine_scaled_reward": -0.12841782718896866,
      "rewards/format_reward": 0.7500000149011612,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 998.3125,
      "epoch": 0.5474285714285714,
      "grad_norm": 1.2914537281090146,
      "kl": 0.15277099609375,
      "learning_rate": 1.0482745016665526e-07,
      "loss": 0.0674,
      "reward": 0.31127920374274254,
      "reward_std": 0.6323697119951248,
      "rewards/cosine_scaled_reward": -0.16727706603705883,
      "rewards/format_reward": 0.6458333432674408,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 984.8750152587891,
      "epoch": 0.5485714285714286,
      "grad_norm": 1.5465556570908303,
      "kl": 0.077606201171875,
      "learning_rate": 1.0437936906629334e-07,
      "loss": 0.0463,
      "reward": 0.5994082670658827,
      "reward_std": 0.37920307368040085,
      "rewards/cosine_scaled_reward": -0.1169625474140048,
      "rewards/format_reward": 0.8333333432674408,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1145.3541870117188,
      "epoch": 0.5497142857142857,
      "grad_norm": 2.3612933288431535,
      "kl": 0.150390625,
      "learning_rate": 1.0395300688680625e-07,
      "loss": 0.0852,
      "reward": 0.18339010886847973,
      "reward_std": 0.6312093585729599,
      "rewards/cosine_scaled_reward": -0.1999716181308031,
      "rewards/format_reward": 0.5833333432674408,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1099.2291870117188,
      "epoch": 0.5508571428571428,
      "grad_norm": 3.3478046762143303,
      "kl": 0.1209716796875,
      "learning_rate": 1.0354838440848501e-07,
      "loss": 0.1351,
      "reward": 0.4303822033107281,
      "reward_std": 0.5441673323512077,
      "rewards/cosine_scaled_reward": -0.10772557370364666,
      "rewards/format_reward": 0.6458333488553762,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 972.8958587646484,
      "epoch": 0.552,
      "grad_norm": 2.070698520552659,
      "kl": 0.1766357421875,
      "learning_rate": 1.0316552135205837e-07,
      "loss": 0.1898,
      "reward": 0.3021550700068474,
      "reward_std": 0.6595650911331177,
      "rewards/cosine_scaled_reward": -0.244755819439888,
      "rewards/format_reward": 0.7916666865348816,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 966.4375,
      "epoch": 0.5531428571428572,
      "grad_norm": 7.4718244138049785,
      "kl": 0.11669921875,
      "learning_rate": 1.0280443637773163e-07,
      "loss": 0.1535,
      "reward": 0.5982861579395831,
      "reward_std": 0.72054024040699,
      "rewards/cosine_scaled_reward": -0.034190285950899124,
      "rewards/format_reward": 0.6666666716337204,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 850.2292022705078,
      "epoch": 0.5542857142857143,
      "grad_norm": 2.6747580946696172,
      "kl": 0.171142578125,
      "learning_rate": 1.0246514708427701e-07,
      "loss": 0.0845,
      "reward": 0.440962532768026,
      "reward_std": 0.4621984176337719,
      "rewards/cosine_scaled_reward": -0.1545187532901764,
      "rewards/format_reward": 0.7500000298023224,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1098.4791870117188,
      "epoch": 0.5554285714285714,
      "grad_norm": 5.009044167350138,
      "kl": 0.1492919921875,
      "learning_rate": 1.0214767000817596e-07,
      "loss": 0.1657,
      "reward": 0.31744778295978904,
      "reward_std": 0.8680954575538635,
      "rewards/cosine_scaled_reward": -0.15377611527219415,
      "rewards/format_reward": 0.6250000149011612,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1250.9167022705078,
      "epoch": 0.5565714285714286,
      "grad_norm": 2.168704280565103,
      "kl": 0.3330078125,
      "learning_rate": 1.0185202062281336e-07,
      "loss": 0.1075,
      "reward": 0.09373046457767487,
      "reward_std": 0.7844668254256248,
      "rewards/cosine_scaled_reward": -0.18230143561959267,
      "rewards/format_reward": 0.4583333432674408,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1182.0417175292969,
      "epoch": 0.5577142857142857,
      "grad_norm": 68.49302633471272,
      "kl": 1.04931640625,
      "learning_rate": 1.0157821333772304e-07,
      "loss": 0.2607,
      "reward": 0.0013678865507245064,
      "reward_std": 0.5483251512050629,
      "rewards/cosine_scaled_reward": -0.2805660478770733,
      "rewards/format_reward": 0.5625000149011612,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 898.8958740234375,
      "epoch": 0.5588571428571428,
      "grad_norm": 3.3267908174810983,
      "kl": 0.22412109375,
      "learning_rate": 1.013262614978859e-07,
      "loss": 0.1155,
      "reward": 0.9380166502669454,
      "reward_std": 0.38279012218117714,
      "rewards/cosine_scaled_reward": 0.10442498326301575,
      "rewards/format_reward": 0.7291666716337204,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1022.5417175292969,
      "epoch": 0.56,
      "grad_norm": 1.2964210055945644,
      "kl": 0.142425537109375,
      "learning_rate": 1.0109617738307911e-07,
      "loss": 0.1375,
      "reward": 0.1352614858187735,
      "reward_std": 0.5779989808797836,
      "rewards/cosine_scaled_reward": -0.29695259779691696,
      "rewards/format_reward": 0.7291666716337204,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1071.2708740234375,
      "epoch": 0.5611428571428572,
      "grad_norm": 5.198263303721915,
      "kl": 0.270751953125,
      "learning_rate": 1.0088797220727779e-07,
      "loss": 0.1398,
      "reward": 0.360213914886117,
      "reward_std": 0.5864584296941757,
      "rewards/cosine_scaled_reward": -0.12197639048099518,
      "rewards/format_reward": 0.6041666716337204,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1146.0208740234375,
      "epoch": 0.5622857142857143,
      "grad_norm": 25.32884427185481,
      "kl": 0.860107421875,
      "learning_rate": 1.0070165611810855e-07,
      "loss": 0.279,
      "reward": 0.3603329248726368,
      "reward_std": 0.4203804060816765,
      "rewards/cosine_scaled_reward": -0.11150021478533745,
      "rewards/format_reward": 0.583333358168602,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1079.6250457763672,
      "epoch": 0.5634285714285714,
      "grad_norm": 5.39013483275012,
      "kl": 0.4027099609375,
      "learning_rate": 1.005372381963547e-07,
      "loss": 0.2018,
      "reward": 0.24866360798478127,
      "reward_std": 0.6557547599077225,
      "rewards/cosine_scaled_reward": -0.21941821463406086,
      "rewards/format_reward": 0.6875000298023224,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1020.0417022705078,
      "epoch": 0.5645714285714286,
      "grad_norm": 39.118419014119006,
      "kl": 1.0677490234375,
      "learning_rate": 1.0039472645551372e-07,
      "loss": 0.2737,
      "reward": 0.027191074565052986,
      "reward_std": 0.4351058676838875,
      "rewards/cosine_scaled_reward": -0.3301544785499573,
      "rewards/format_reward": 0.6875000149011612,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1194.8958740234375,
      "epoch": 0.5657142857142857,
      "grad_norm": 5.938670898030579,
      "kl": 0.630859375,
      "learning_rate": 1.002741278414069e-07,
      "loss": 0.2055,
      "reward": 0.374758190009743,
      "reward_std": 0.6815578863024712,
      "rewards/cosine_scaled_reward": -0.09387091733515263,
      "rewards/format_reward": 0.5625000298023224,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1120.3542022705078,
      "epoch": 0.5668571428571428,
      "grad_norm": 16.235625518016562,
      "kl": 0.50927734375,
      "learning_rate": 1.0017544823184055e-07,
      "loss": 0.297,
      "reward": 0.40772235160693526,
      "reward_std": 0.8966069668531418,
      "rewards/cosine_scaled_reward": -0.09822217002511024,
      "rewards/format_reward": 0.6041666865348816,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 827.4791870117188,
      "epoch": 0.568,
      "grad_norm": 6.892460021170429,
      "kl": 6.8536376953125,
      "learning_rate": 1.0009869243631952e-07,
      "loss": 0.2026,
      "reward": 0.8302161321043968,
      "reward_std": 0.560060553252697,
      "rewards/cosine_scaled_reward": 0.06094140186905861,
      "rewards/format_reward": 0.708333358168602,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1077.1250305175781,
      "epoch": 0.5691428571428572,
      "grad_norm": 10.255398655040155,
      "kl": 0.54931640625,
      "learning_rate": 1.000438641958131e-07,
      "loss": 0.2299,
      "reward": 0.033572545275092125,
      "reward_std": 0.4632219597697258,
      "rewards/cosine_scaled_reward": -0.30613040924072266,
      "rewards/format_reward": 0.645833358168602,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1336.7083435058594,
      "epoch": 0.5702857142857143,
      "grad_norm": 27.33692044026225,
      "kl": 0.934326171875,
      "learning_rate": 1.0001096618257236e-07,
      "loss": 0.1642,
      "reward": -0.13300850987434387,
      "reward_std": 0.6832303777337074,
      "rewards/cosine_scaled_reward": -0.28525424748659134,
      "rewards/format_reward": 0.4375000149011612,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1017.9375305175781,
      "epoch": 0.5714285714285714,
      "grad_norm": 2.4162040111238334,
      "kl": 0.2301025390625,
      "learning_rate": 1e-07,
      "loss": 0.1131,
      "reward": 0.13043908029794693,
      "reward_std": 0.5788910314440727,
      "rewards/cosine_scaled_reward": -0.29936380684375763,
      "rewards/format_reward": 0.729166679084301,
      "step": 500
    },
    {
      "epoch": 0.5714285714285714,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.7532739232839085,
      "train_runtime": 13678.504,
      "train_samples_per_second": 1.755,
      "train_steps_per_second": 0.037
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}