{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2857142857142857,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 3140.2083435058594,
      "epoch": 0.0005714285714285715,
      "grad_norm": 0.18733426928520203,
      "kl": 0.1341552734375,
      "learning_rate": 0.0,
      "loss": -0.0125,
      "reward": 0.13575429469347,
      "reward_std": 0.2010277360677719,
      "rewards/cosine_scaled_reward": -0.057122852653265,
      "rewards/format_reward": 0.25,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3231.1666870117188,
      "epoch": 0.001142857142857143,
      "grad_norm": 0.3353370130062103,
      "kl": 0.05010986328125,
      "learning_rate": 2e-08,
      "loss": 0.1382,
      "reward": -0.5267819836735725,
      "reward_std": 0.38683023303747177,
      "rewards/cosine_scaled_reward": -0.36755766719579697,
      "rewards/format_reward": 0.2083333358168602,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3376.1250610351562,
      "epoch": 0.0017142857142857142,
      "grad_norm": 0.29977869987487793,
      "kl": 0.0494384765625,
      "learning_rate": 4e-08,
      "loss": 0.0883,
      "reward": -0.0896148718893528,
      "reward_std": 0.8756385631859303,
      "rewards/cosine_scaled_reward": -0.12814077525399625,
      "rewards/format_reward": 0.1666666716337204,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3431.9583740234375,
      "epoch": 0.002285714285714286,
      "grad_norm": 0.1965445876121521,
      "kl": 0.0457763671875,
      "learning_rate": 6e-08,
      "loss": 0.0477,
      "reward": -0.19004566967487335,
      "reward_std": 0.6523252762854099,
      "rewards/cosine_scaled_reward": -0.17835617810487747,
      "rewards/format_reward": 0.1666666679084301,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3436.8333740234375,
      "epoch": 0.002857142857142857,
      "grad_norm": 0.20483434200286865,
      "kl": 0.060546875,
      "learning_rate": 8e-08,
      "loss": 0.0513,
      "reward": -0.4698427654802799,
      "reward_std": 0.36434993892908096,
      "rewards/cosine_scaled_reward": -0.2974213883280754,
      "rewards/format_reward": 0.125,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3070.0416870117188,
      "epoch": 0.0034285714285714284,
      "grad_norm": 0.28048911690711975,
      "kl": 0.044189453125,
      "learning_rate": 1e-07,
      "loss": 0.153,
      "reward": -0.19587285071611404,
      "reward_std": 0.4000375494360924,
      "rewards/cosine_scaled_reward": -0.1812697658315301,
      "rewards/format_reward": 0.1666666716337204,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3141.4583435058594,
      "epoch": 0.004,
      "grad_norm": 0.17636096477508545,
      "kl": 0.047271728515625,
      "learning_rate": 1.2e-07,
      "loss": 0.0253,
      "reward": -0.18599995225667953,
      "reward_std": 0.2766329199075699,
      "rewards/cosine_scaled_reward": -0.19716664776206017,
      "rewards/format_reward": 0.2083333432674408,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3150.5833435058594,
      "epoch": 0.004571428571428572,
      "grad_norm": 0.23299627006053925,
      "kl": 0.042236328125,
      "learning_rate": 1.4e-07,
      "loss": 0.0593,
      "reward": -0.5499451458454132,
      "reward_std": 0.25627370551228523,
      "rewards/cosine_scaled_reward": -0.3791392296552658,
      "rewards/format_reward": 0.2083333432674408,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3046.666717529297,
      "epoch": 0.005142857142857143,
      "grad_norm": 0.23352369666099548,
      "kl": 0.04388427734375,
      "learning_rate": 1.6e-07,
      "loss": 0.0801,
      "reward": 0.21459830552339554,
      "reward_std": 0.5846549328416586,
      "rewards/cosine_scaled_reward": -0.05936753377318382,
      "rewards/format_reward": 0.3333333358168602,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3295.9583740234375,
      "epoch": 0.005714285714285714,
      "grad_norm": 0.1519400179386139,
      "kl": 0.03594970703125,
      "learning_rate": 1.8e-07,
      "loss": 0.0366,
      "reward": 0.6032139137387276,
      "reward_std": 1.0609627589583397,
      "rewards/cosine_scaled_reward": 0.11410695873200893,
      "rewards/format_reward": 0.3750000111758709,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2851.3333740234375,
      "epoch": 0.006285714285714286,
      "grad_norm": 0.2642011344432831,
      "kl": 0.05914306640625,
      "learning_rate": 2e-07,
      "loss": 0.0972,
      "reward": -0.20808421075344086,
      "reward_std": 0.41022560093551874,
      "rewards/cosine_scaled_reward": -0.24987544119358063,
      "rewards/format_reward": 0.2916666679084301,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3317.0833740234375,
      "epoch": 0.006857142857142857,
      "grad_norm": 0.18985775113105774,
      "kl": 0.0611572265625,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 0.0187,
      "reward": -0.09276259876787663,
      "reward_std": 0.3626005630940199,
      "rewards/cosine_scaled_reward": -0.19221464078873396,
      "rewards/format_reward": 0.2916666679084301,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3030.250030517578,
      "epoch": 0.0074285714285714285,
      "grad_norm": 0.3722766935825348,
      "kl": 0.043060302734375,
      "learning_rate": 2.4e-07,
      "loss": 0.203,
      "reward": 0.2502866378054023,
      "reward_std": 0.5806700736284256,
      "rewards/cosine_scaled_reward": -0.020690007135272026,
      "rewards/format_reward": 0.2916666716337204,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3173.0833740234375,
      "epoch": 0.008,
      "grad_norm": 0.21710394322872162,
      "kl": 0.039794921875,
      "learning_rate": 2.6e-07,
      "loss": 0.128,
      "reward": -0.06073956936597824,
      "reward_std": 0.5397324226796627,
      "rewards/cosine_scaled_reward": -0.15536978468298912,
      "rewards/format_reward": 0.2500000111758709,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3022.125,
      "epoch": 0.008571428571428572,
      "grad_norm": 0.2290153205394745,
      "kl": 0.0469970703125,
      "learning_rate": 2.8e-07,
      "loss": 0.1717,
      "reward": -0.36692222114652395,
      "reward_std": 0.1796758584678173,
      "rewards/cosine_scaled_reward": -0.2876277659088373,
      "rewards/format_reward": 0.2083333432674408,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3381.4583740234375,
      "epoch": 0.009142857142857144,
      "grad_norm": 0.18353527784347534,
      "kl": 0.045654296875,
      "learning_rate": 3e-07,
      "loss": 0.035,
      "reward": 0.09733710438013077,
      "reward_std": 0.7802678793668747,
      "rewards/cosine_scaled_reward": -0.05549812689423561,
      "rewards/format_reward": 0.2083333395421505,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.009714285714285713,
      "grad_norm": 0.17221970856189728,
      "kl": 0.05108642578125,
      "learning_rate": 3.2e-07,
      "loss": 0.0002,
      "reward": -0.521545228548348,
      "reward_std": 0.3114899694919586,
      "rewards/cosine_scaled_reward": -0.28160594776272774,
      "rewards/format_reward": 0.0416666679084301,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3568.375,
      "epoch": 0.010285714285714285,
      "grad_norm": 0.146370992064476,
      "kl": 0.04718017578125,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.0092,
      "reward": -0.44301459565758705,
      "reward_std": 0.5276463013142347,
      "rewards/cosine_scaled_reward": -0.24234064668416977,
      "rewards/format_reward": 0.0416666679084301,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3245.1666870117188,
      "epoch": 0.010857142857142857,
      "grad_norm": 0.1941138356924057,
      "kl": 0.04986572265625,
      "learning_rate": 3.6e-07,
      "loss": 0.0432,
      "reward": 0.06469105184078217,
      "reward_std": 0.7173988372087479,
      "rewards/cosine_scaled_reward": -0.11348781548440456,
      "rewards/format_reward": 0.2916666716337204,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2396.625030517578,
      "epoch": 0.011428571428571429,
      "grad_norm": 0.2624322772026062,
      "kl": 0.0740966796875,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 0.034,
      "reward": 0.6277919709682465,
      "reward_std": 0.7424036711454391,
      "rewards/cosine_scaled_reward": 0.06389598548412323,
      "rewards/format_reward": 0.5000000111758709,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2653.6250610351562,
      "epoch": 0.012,
      "grad_norm": 0.2804762125015259,
      "kl": 0.05694580078125,
      "learning_rate": 4e-07,
      "loss": -0.0701,
      "reward": -0.026539891958236694,
      "reward_std": 0.4789135903120041,
      "rewards/cosine_scaled_reward": -0.26326995715498924,
      "rewards/format_reward": 0.5000000111758709,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3540.2083740234375,
      "epoch": 0.012571428571428572,
      "grad_norm": 0.13607840240001678,
      "kl": 0.04083251953125,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 0.0263,
      "reward": -0.28362276405096054,
      "reward_std": 0.4733579605817795,
      "rewards/cosine_scaled_reward": -0.16264472343027592,
      "rewards/format_reward": 0.0416666679084301,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3071.9166870117188,
      "epoch": 0.013142857142857144,
      "grad_norm": 0.3339034914970398,
      "kl": 0.05059814453125,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 0.1545,
      "reward": -0.3420925512909889,
      "reward_std": 0.4987642988562584,
      "rewards/cosine_scaled_reward": -0.27521293610334396,
      "rewards/format_reward": 0.2083333358168602,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3343.0000610351562,
      "epoch": 0.013714285714285714,
      "grad_norm": 0.19908681511878967,
      "kl": 0.04998779296875,
      "learning_rate": 4.6e-07,
      "loss": 0.0843,
      "reward": -0.06604887545108795,
      "reward_std": 0.8876266591250896,
      "rewards/cosine_scaled_reward": -0.11635777913033962,
      "rewards/format_reward": 0.1666666679084301,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2447.2500610351562,
      "epoch": 0.014285714285714285,
      "grad_norm": 0.2169143408536911,
      "kl": 0.044097900390625,
      "learning_rate": 4.8e-07,
      "loss": 0.014,
      "reward": 0.9716870114207268,
      "reward_std": 0.41379065811634064,
      "rewards/cosine_scaled_reward": 0.1733434647321701,
      "rewards/format_reward": 0.625,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3371.125,
      "epoch": 0.014857142857142857,
      "grad_norm": 0.1605193167924881,
      "kl": 0.049896240234375,
      "learning_rate": 5e-07,
      "loss": 0.0591,
      "reward": -0.2308735428377986,
      "reward_std": 0.41561231948435307,
      "rewards/cosine_scaled_reward": -0.1779367751441896,
      "rewards/format_reward": 0.125,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3489.125,
      "epoch": 0.015428571428571429,
      "grad_norm": 0.14547424018383026,
      "kl": 0.043914794921875,
      "learning_rate": 5.2e-07,
      "loss": 0.0077,
      "reward": -0.3666146732866764,
      "reward_std": 0.313198696821928,
      "rewards/cosine_scaled_reward": -0.24580733105540276,
      "rewards/format_reward": 0.125,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3418.5833740234375,
      "epoch": 0.016,
      "grad_norm": 0.19328951835632324,
      "kl": 0.04888916015625,
      "learning_rate": 5.4e-07,
      "loss": 0.057,
      "reward": -0.2427891194820404,
      "reward_std": 0.4138132072985172,
      "rewards/cosine_scaled_reward": -0.18389457929879427,
      "rewards/format_reward": 0.125,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2894.2083435058594,
      "epoch": 0.01657142857142857,
      "grad_norm": 0.20895135402679443,
      "kl": 0.0489501953125,
      "learning_rate": 5.6e-07,
      "loss": -0.0056,
      "reward": 0.09402500465512276,
      "reward_std": 0.35269030928611755,
      "rewards/cosine_scaled_reward": -0.07798751257359982,
      "rewards/format_reward": 0.25,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.017142857142857144,
      "grad_norm": 0.2097480595111847,
      "kl": 0.066162109375,
      "learning_rate": 5.8e-07,
      "loss": 0.0003,
      "reward": -0.6841397285461426,
      "reward_std": 0.1242629922926426,
      "rewards/cosine_scaled_reward": -0.3420698642730713,
      "rewards/format_reward": 0.0,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2740.9166870117188,
      "epoch": 0.017714285714285714,
      "grad_norm": 0.2932406961917877,
      "kl": 0.0579833984375,
      "learning_rate": 6e-07,
      "loss": 0.1181,
      "reward": -0.3630891740322113,
      "reward_std": 0.44184136018157005,
      "rewards/cosine_scaled_reward": -0.32737791910767555,
      "rewards/format_reward": 0.2916666679084301,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2855.375,
      "epoch": 0.018285714285714287,
      "grad_norm": 0.18361368775367737,
      "kl": 0.04327392578125,
      "learning_rate": 6.2e-07,
      "loss": -0.0227,
      "reward": 0.5959962904453278,
      "reward_std": 0.762365136295557,
      "rewards/cosine_scaled_reward": 0.06883148103952408,
      "rewards/format_reward": 0.4583333432674408,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.018857142857142857,
      "grad_norm": 0.16273260116577148,
      "kl": 0.0531005859375,
      "learning_rate": 6.4e-07,
      "loss": 0.0002,
      "reward": -0.4899278059601784,
      "reward_std": 0.16865173168480396,
      "rewards/cosine_scaled_reward": -0.2449638955295086,
      "rewards/format_reward": 0.0,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3541.4166870117188,
      "epoch": 0.019428571428571427,
      "grad_norm": 0.18354347348213196,
      "kl": 0.045654296875,
      "learning_rate": 6.6e-07,
      "loss": 0.0147,
      "reward": -0.531697541475296,
      "reward_std": 0.3807820826768875,
      "rewards/cosine_scaled_reward": -0.2866821028292179,
      "rewards/format_reward": 0.0416666679084301,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3476.3333740234375,
      "epoch": 0.02,
      "grad_norm": 0.15978805720806122,
      "kl": 0.0550537109375,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0515,
      "reward": -0.6665981858968735,
      "reward_std": 0.2222603689879179,
      "rewards/cosine_scaled_reward": -0.3541324511170387,
      "rewards/format_reward": 0.0416666679084301,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3353.0416870117188,
      "epoch": 0.02057142857142857,
      "grad_norm": 0.22307412326335907,
      "kl": 0.05029296875,
      "learning_rate": 7e-07,
      "loss": 0.0934,
      "reward": -0.26416327990591526,
      "reward_std": 0.2276486847549677,
      "rewards/cosine_scaled_reward": -0.1945816483348608,
      "rewards/format_reward": 0.1250000037252903,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.021142857142857144,
      "grad_norm": 0.1473287045955658,
      "kl": 0.040771484375,
      "learning_rate": 7.2e-07,
      "loss": 0.0002,
      "reward": -0.50784532725811,
      "reward_std": 0.24061324447393417,
      "rewards/cosine_scaled_reward": -0.2539226710796356,
      "rewards/format_reward": 0.0,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3406.416748046875,
      "epoch": 0.021714285714285714,
      "grad_norm": 0.22248540818691254,
      "kl": 0.0460205078125,
      "learning_rate": 7.4e-07,
      "loss": 0.0835,
      "reward": -0.5920155718922615,
      "reward_std": 0.3137332946062088,
      "rewards/cosine_scaled_reward": -0.37934111058712006,
      "rewards/format_reward": 0.1666666679084301,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2601.416717529297,
      "epoch": 0.022285714285714287,
      "grad_norm": 0.4459127187728882,
      "kl": 0.0550537109375,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.3106,
      "reward": 0.2684231176972389,
      "reward_std": 0.47980744019150734,
      "rewards/cosine_scaled_reward": -0.09495509788393974,
      "rewards/format_reward": 0.4583333544433117,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.022857142857142857,
      "grad_norm": 0.15880325436592102,
      "kl": 0.04510498046875,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.0002,
      "reward": -0.6260051801800728,
      "reward_std": 0.1923852041363716,
      "rewards/cosine_scaled_reward": -0.3130025826394558,
      "rewards/format_reward": 0.0,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3001.625030517578,
      "epoch": 0.023428571428571427,
      "grad_norm": 0.16983532905578613,
      "kl": 0.05487060546875,
      "learning_rate": 8e-07,
      "loss": 0.041,
      "reward": 0.07866484671831131,
      "reward_std": 0.4578991234302521,
      "rewards/cosine_scaled_reward": -0.10650091245770454,
      "rewards/format_reward": 0.2916666679084301,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2764.000030517578,
      "epoch": 0.024,
      "grad_norm": 0.24087683856487274,
      "kl": 0.038360595703125,
      "learning_rate": 8.199999999999999e-07,
      "loss": 0.0531,
      "reward": 0.32172612100839615,
      "reward_std": 0.48358193784952164,
      "rewards/cosine_scaled_reward": -0.026636939495801926,
      "rewards/format_reward": 0.375,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.02457142857142857,
      "grad_norm": 0.21731670200824738,
      "kl": 0.0496826171875,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.0002,
      "reward": -0.5966501906514168,
      "reward_std": 0.23214636743068695,
      "rewards/cosine_scaled_reward": -0.298325102776289,
      "rewards/format_reward": 0.0,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.025142857142857144,
      "grad_norm": 0.16481076180934906,
      "kl": 0.044677734375,
      "learning_rate": 8.599999999999999e-07,
      "loss": 0.0002,
      "reward": -0.6713002845644951,
      "reward_std": 0.19106930866837502,
      "rewards/cosine_scaled_reward": -0.33565014228224754,
      "rewards/format_reward": 0.0,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3372.2500610351562,
      "epoch": 0.025714285714285714,
      "grad_norm": 0.17415094375610352,
      "kl": 0.0469970703125,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.0527,
      "reward": -0.5838596299290657,
      "reward_std": 0.42229044809937477,
      "rewards/cosine_scaled_reward": -0.31276314333081245,
      "rewards/format_reward": 0.0416666679084301,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3363.375,
      "epoch": 0.026285714285714287,
      "grad_norm": 0.21891918778419495,
      "kl": 0.04705810546875,
      "learning_rate": 9e-07,
      "loss": 0.08,
      "reward": -0.1459154188632965,
      "reward_std": 0.8675736896693707,
      "rewards/cosine_scaled_reward": -0.17712438106536865,
      "rewards/format_reward": 0.2083333358168602,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2864.9583435058594,
      "epoch": 0.026857142857142857,
      "grad_norm": 0.20759688317775726,
      "kl": 0.03955078125,
      "learning_rate": 9.2e-07,
      "loss": 0.0259,
      "reward": -0.1351792812347412,
      "reward_std": 0.3917945884168148,
      "rewards/cosine_scaled_reward": -0.19258963316679,
      "rewards/format_reward": 0.25,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2618.416717529297,
      "epoch": 0.027428571428571427,
      "grad_norm": 0.19388090074062347,
      "kl": 0.04571533203125,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.072,
      "reward": 0.5992091596126556,
      "reward_std": 1.018235296010971,
      "rewards/cosine_scaled_reward": 0.007937910500913858,
      "rewards/format_reward": 0.5833333432674408,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2918.5833435058594,
      "epoch": 0.028,
      "grad_norm": 0.27869799733161926,
      "kl": 0.0540771484375,
      "learning_rate": 9.6e-07,
      "loss": 0.1327,
      "reward": 0.18005166947841644,
      "reward_std": 0.7894617840647697,
      "rewards/cosine_scaled_reward": -0.07664081640541553,
      "rewards/format_reward": 0.3333333432674408,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3575.375,
      "epoch": 0.02857142857142857,
      "grad_norm": 0.16276530921459198,
      "kl": 0.04864501953125,
      "learning_rate": 9.8e-07,
      "loss": 0.0023,
      "reward": -0.4990532919764519,
      "reward_std": 0.28135714679956436,
      "rewards/cosine_scaled_reward": -0.29119331762194633,
      "rewards/format_reward": 0.0833333358168602,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2936.0,
      "epoch": 0.029142857142857144,
      "grad_norm": 0.14888997375965118,
      "kl": 0.04449462890625,
      "learning_rate": 1e-06,
      "loss": -0.043,
      "reward": 0.0526563823223114,
      "reward_std": 0.32037340477108955,
      "rewards/cosine_scaled_reward": -0.0986718013882637,
      "rewards/format_reward": 0.25,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.029714285714285714,
      "grad_norm": 0.16475068032741547,
      "kl": 0.04327392578125,
      "learning_rate": 9.999890338174275e-07,
      "loss": 0.0002,
      "reward": -0.734376922249794,
      "reward_std": 0.2161643784493208,
      "rewards/cosine_scaled_reward": -0.3671884685754776,
      "rewards/format_reward": 0.0,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3400.1666870117188,
      "epoch": 0.030285714285714287,
      "grad_norm": 0.17514079809188843,
      "kl": 0.03863525390625,
      "learning_rate": 9.999561358041868e-07,
      "loss": 0.0608,
      "reward": -0.09651139751076698,
      "reward_std": 0.5052468162029982,
      "rewards/cosine_scaled_reward": -0.11075571551918983,
      "rewards/format_reward": 0.125,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3467.6666870117188,
      "epoch": 0.030857142857142857,
      "grad_norm": 0.19022035598754883,
      "kl": 0.05419921875,
      "learning_rate": 9.999013075636804e-07,
      "loss": 0.0556,
      "reward": -0.5887648984789848,
      "reward_std": 0.2710861638188362,
      "rewards/cosine_scaled_reward": -0.3360491245985031,
      "rewards/format_reward": 0.0833333358168602,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.03142857142857143,
      "grad_norm": 0.15405318140983582,
      "kl": 0.05474853515625,
      "learning_rate": 9.998245517681593e-07,
      "loss": 0.0002,
      "reward": -0.6332229375839233,
      "reward_std": 0.44320254772901535,
      "rewards/cosine_scaled_reward": -0.33744481950998306,
      "rewards/format_reward": 0.0416666679084301,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3520.0,
      "epoch": 0.032,
      "grad_norm": 0.16492366790771484,
      "kl": 0.0576171875,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0222,
      "reward": -0.5373398922383785,
      "reward_std": 0.3259655721485615,
      "rewards/cosine_scaled_reward": -0.31033661775290966,
      "rewards/format_reward": 0.0833333358168602,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3196.5416870117188,
      "epoch": 0.03257142857142857,
      "grad_norm": 0.3228585124015808,
      "kl": 0.09649658203125,
      "learning_rate": 9.996052735444862e-07,
      "loss": -0.0253,
      "reward": 0.02994374930858612,
      "reward_std": 0.391297597438097,
      "rewards/cosine_scaled_reward": -0.11002812534570694,
      "rewards/format_reward": 0.25,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.03314285714285714,
      "grad_norm": 0.15809538960456848,
      "kl": 0.0439453125,
      "learning_rate": 9.994627618036452e-07,
      "loss": 0.0002,
      "reward": -0.7625578194856644,
      "reward_std": 0.20482752844691277,
      "rewards/cosine_scaled_reward": -0.381278894841671,
      "rewards/format_reward": 0.0,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.03371428571428572,
      "grad_norm": 0.16887404024600983,
      "kl": 0.0562744140625,
      "learning_rate": 9.992983438818915e-07,
      "loss": 0.0002,
      "reward": -0.511197448708117,
      "reward_std": 0.14204201754182577,
      "rewards/cosine_scaled_reward": -0.2555987243540585,
      "rewards/format_reward": 0.0,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3469.0416870117188,
      "epoch": 0.03428571428571429,
      "grad_norm": 0.14915870130062103,
      "kl": 0.03826904296875,
      "learning_rate": 9.991120277927223e-07,
      "loss": 0.0642,
      "reward": -0.617660641670227,
      "reward_std": 0.24892418831586838,
      "rewards/cosine_scaled_reward": -0.329663660377264,
      "rewards/format_reward": 0.0416666679084301,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2802.791717529297,
      "epoch": 0.03485714285714286,
      "grad_norm": 0.20380046963691711,
      "kl": 0.0477294921875,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.1085,
      "reward": -0.008406132459640503,
      "reward_std": 0.8550728969275951,
      "rewards/cosine_scaled_reward": -0.17086973786354065,
      "rewards/format_reward": 0.3333333358168602,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2853.2083435058594,
      "epoch": 0.03542857142857143,
      "grad_norm": 0.31721076369285583,
      "kl": 0.06878662109375,
      "learning_rate": 9.98673738502114e-07,
      "loss": 0.0961,
      "reward": -0.38323642313480377,
      "reward_std": 0.25505492370575666,
      "rewards/cosine_scaled_reward": -0.3166182152926922,
      "rewards/format_reward": 0.25,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.036,
      "grad_norm": 0.15038253366947174,
      "kl": 0.04864501953125,
      "learning_rate": 9.98421786662277e-07,
      "loss": 0.0002,
      "reward": -0.4516504108905792,
      "reward_std": 0.26408347859978676,
      "rewards/cosine_scaled_reward": -0.22582519799470901,
      "rewards/format_reward": 0.0,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3534.2916870117188,
      "epoch": 0.036571428571428574,
      "grad_norm": 0.1737290471792221,
      "kl": 0.049896240234375,
      "learning_rate": 9.981479793771866e-07,
      "loss": 0.0283,
      "reward": -0.6200313568115234,
      "reward_std": 0.1568075306713581,
      "rewards/cosine_scaled_reward": -0.3308490067720413,
      "rewards/format_reward": 0.0416666679084301,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.037142857142857144,
      "grad_norm": 0.15397769212722778,
      "kl": 0.05743408203125,
      "learning_rate": 9.97852329991824e-07,
      "loss": 0.0002,
      "reward": -0.4742198493331671,
      "reward_std": 0.20335539802908897,
      "rewards/cosine_scaled_reward": -0.2371099255979061,
      "rewards/format_reward": 0.0,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3236.666748046875,
      "epoch": 0.037714285714285714,
      "grad_norm": 0.19795703887939453,
      "kl": 0.03839111328125,
      "learning_rate": 9.975348529157229e-07,
      "loss": 0.1441,
      "reward": 0.2815367206931114,
      "reward_std": 1.0324797630310059,
      "rewards/cosine_scaled_reward": -0.025898311287164688,
      "rewards/format_reward": 0.3333333395421505,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2816.375,
      "epoch": 0.038285714285714284,
      "grad_norm": 0.19631832838058472,
      "kl": 0.04827880859375,
      "learning_rate": 9.971955636222684e-07,
      "loss": -0.0521,
      "reward": -0.030950482934713364,
      "reward_std": 0.6847976944409311,
      "rewards/cosine_scaled_reward": -0.16130859032273293,
      "rewards/format_reward": 0.2916666679084301,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.038857142857142854,
      "grad_norm": 0.271883100271225,
      "kl": 0.07073974609375,
      "learning_rate": 9.968344786479415e-07,
      "loss": 0.0003,
      "reward": -0.6612199693918228,
      "reward_std": 0.1911415420472622,
      "rewards/cosine_scaled_reward": -0.3306099846959114,
      "rewards/format_reward": 0.0,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3492.9583740234375,
      "epoch": 0.03942857142857143,
      "grad_norm": 0.14704446494579315,
      "kl": 0.04931640625,
      "learning_rate": 9.964516155915151e-07,
      "loss": 0.0101,
      "reward": -0.46425507962703705,
      "reward_std": 0.4483284428715706,
      "rewards/cosine_scaled_reward": -0.2946275472640991,
      "rewards/format_reward": 0.125,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3439.3750610351562,
      "epoch": 0.04,
      "grad_norm": 0.2434043288230896,
      "kl": 0.05535888671875,
      "learning_rate": 9.960469931131936e-07,
      "loss": 0.0798,
      "reward": -0.6196610480546951,
      "reward_std": 0.322536863386631,
      "rewards/cosine_scaled_reward": -0.35149718821048737,
      "rewards/format_reward": 0.0833333358168602,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2774.1250610351562,
      "epoch": 0.04057142857142857,
      "grad_norm": 0.32268911600112915,
      "kl": 0.0482177734375,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.1546,
      "reward": 0.546534039080143,
      "reward_std": 0.8966285344213247,
      "rewards/cosine_scaled_reward": 0.0024337023496627808,
      "rewards/format_reward": 0.5416666865348816,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3378.0833740234375,
      "epoch": 0.04114285714285714,
      "grad_norm": 0.17585237324237823,
      "kl": 0.0487060546875,
      "learning_rate": 9.951725498333448e-07,
      "loss": 0.066,
      "reward": -0.14741092920303345,
      "reward_std": 0.6694340538233519,
      "rewards/cosine_scaled_reward": -0.15703882090747356,
      "rewards/format_reward": 0.1666666679084301,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3107.8750610351562,
      "epoch": 0.04171428571428572,
      "grad_norm": 0.24229124188423157,
      "kl": 0.05169677734375,
      "learning_rate": 9.947027716509488e-07,
      "loss": 0.0981,
      "reward": -0.270541962236166,
      "reward_std": 0.5891504883766174,
      "rewards/cosine_scaled_reward": -0.30193765088915825,
      "rewards/format_reward": 0.3333333432674408,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3386.25,
      "epoch": 0.04228571428571429,
      "grad_norm": 0.16542011499404907,
      "kl": 0.04302978515625,
      "learning_rate": 9.942113192828444e-07,
      "loss": 0.0643,
      "reward": -0.47412845492362976,
      "reward_std": 0.3468447830528021,
      "rewards/cosine_scaled_reward": -0.2787308990955353,
      "rewards/format_reward": 0.0833333358168602,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3475.6666870117188,
      "epoch": 0.04285714285714286,
      "grad_norm": 0.1598564237356186,
      "kl": 0.04718017578125,
      "learning_rate": 9.93698216681727e-07,
      "loss": 0.0421,
      "reward": -0.3513486757874489,
      "reward_std": 0.7791556939482689,
      "rewards/cosine_scaled_reward": -0.23817433044314384,
      "rewards/format_reward": 0.1250000037252903,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3499.7083740234375,
      "epoch": 0.04342857142857143,
      "grad_norm": 0.15245261788368225,
      "kl": 0.05072021484375,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.0273,
      "reward": -0.26299357414245605,
      "reward_std": 0.615978293120861,
      "rewards/cosine_scaled_reward": -0.19399680197238922,
      "rewards/format_reward": 0.125,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3544.5416870117188,
      "epoch": 0.044,
      "grad_norm": 0.17183570563793182,
      "kl": 0.05072021484375,
      "learning_rate": 9.926071618660237e-07,
      "loss": 0.015,
      "reward": -0.29701984860002995,
      "reward_std": 0.553566699847579,
      "rewards/cosine_scaled_reward": -0.23184325452893972,
      "rewards/format_reward": 0.1666666679084301,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3498.0000610351562,
      "epoch": 0.044571428571428574,
      "grad_norm": 0.16006886959075928,
      "kl": 0.04241943359375,
      "learning_rate": 9.9202926282791e-07,
      "loss": 0.0266,
      "reward": -0.0037414096295833588,
      "reward_std": 0.9226736649870872,
      "rewards/cosine_scaled_reward": -0.10603736154735088,
      "rewards/format_reward": 0.2083333358168602,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3099.75,
      "epoch": 0.045142857142857144,
      "grad_norm": 0.15230302512645721,
      "kl": 0.0433349609375,
      "learning_rate": 9.91429819907136e-07,
      "loss": -0.0371,
      "reward": -0.008567571640014648,
      "reward_std": 0.4445110894739628,
      "rewards/cosine_scaled_reward": -0.12928379327058792,
      "rewards/format_reward": 0.25,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3559.0833740234375,
      "epoch": 0.045714285714285714,
      "grad_norm": 0.1428958624601364,
      "kl": 0.0391845703125,
      "learning_rate": 9.908088623197048e-07,
      "loss": 0.0109,
      "reward": -0.22227831184864044,
      "reward_std": 0.46194060891866684,
      "rewards/cosine_scaled_reward": -0.13197248615324497,
      "rewards/format_reward": 0.0416666679084301,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3474.541748046875,
      "epoch": 0.046285714285714284,
      "grad_norm": 0.19979149103164673,
      "kl": 0.0528564453125,
      "learning_rate": 9.901664203302124e-07,
      "loss": 0.0265,
      "reward": 0.20810034382157028,
      "reward_std": 0.55513103492558,
      "rewards/cosine_scaled_reward": -0.00011649727821350098,
      "rewards/format_reward": 0.2083333395421505,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3402.75,
      "epoch": 0.046857142857142854,
      "grad_norm": 0.16143812239170074,
      "kl": 0.04791259765625,
      "learning_rate": 9.895025252503755e-07,
      "loss": 0.0423,
      "reward": -0.17914994060993195,
      "reward_std": 0.677577305585146,
      "rewards/cosine_scaled_reward": -0.17290829867124557,
      "rewards/format_reward": 0.1666666716337204,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2067.291702270508,
      "epoch": 0.04742857142857143,
      "grad_norm": 0.2830398380756378,
      "kl": 0.073974609375,
      "learning_rate": 9.888172094375033e-07,
      "loss": 0.1007,
      "reward": 0.6157565079629421,
      "reward_std": 0.6514625661075115,
      "rewards/cosine_scaled_reward": -0.025455085560679436,
      "rewards/format_reward": 0.6666666716337204,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3327.916748046875,
      "epoch": 0.048,
      "grad_norm": 0.18658600747585297,
      "kl": 0.0465087890625,
      "learning_rate": 9.881105062929221e-07,
      "loss": 0.049,
      "reward": -0.15906250849366188,
      "reward_std": 0.772390453144908,
      "rewards/cosine_scaled_reward": -0.2045312598347664,
      "rewards/format_reward": 0.2500000074505806,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3180.3333740234375,
      "epoch": 0.04857142857142857,
      "grad_norm": 0.16970154643058777,
      "kl": 0.05596923828125,
      "learning_rate": 9.873824502603459e-07,
      "loss": 0.0349,
      "reward": 0.2883519548922777,
      "reward_std": 0.6625581197440624,
      "rewards/cosine_scaled_reward": -0.02249070629477501,
      "rewards/format_reward": 0.3333333432674408,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3541.875,
      "epoch": 0.04914285714285714,
      "grad_norm": 0.15287643671035767,
      "kl": 0.04180908203125,
      "learning_rate": 9.866330768241983e-07,
      "loss": 0.0253,
      "reward": -0.5468939123675227,
      "reward_std": 0.47659813798964024,
      "rewards/cosine_scaled_reward": -0.29428029619157314,
      "rewards/format_reward": 0.0416666679084301,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3427.7916870117188,
      "epoch": 0.04971428571428571,
      "grad_norm": 0.1800970584154129,
      "kl": 0.052978515625,
      "learning_rate": 9.85862422507884e-07,
      "loss": 0.0337,
      "reward": -0.22589577734470367,
      "reward_std": 0.6039449013769627,
      "rewards/cosine_scaled_reward": -0.19628122448921204,
      "rewards/format_reward": 0.1666666716337204,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3364.125,
      "epoch": 0.05028571428571429,
      "grad_norm": 0.1843470185995102,
      "kl": 0.05328369140625,
      "learning_rate": 9.850705248720068e-07,
      "loss": 0.0534,
      "reward": -0.588555134832859,
      "reward_std": 0.29554150719195604,
      "rewards/cosine_scaled_reward": -0.37761090695858,
      "rewards/format_reward": 0.1666666716337204,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3422.7916870117188,
      "epoch": 0.05085714285714286,
      "grad_norm": 0.16204486787319183,
      "kl": 0.050537109375,
      "learning_rate": 9.8425742251254e-07,
      "loss": 0.0307,
      "reward": -0.49031344801187515,
      "reward_std": 0.3072348916903138,
      "rewards/cosine_scaled_reward": -0.3284900598227978,
      "rewards/format_reward": 0.1666666716337204,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2920.5833740234375,
      "epoch": 0.05142857142857143,
      "grad_norm": 0.2106999307870865,
      "kl": 0.041412353515625,
      "learning_rate": 9.83423155058946e-07,
      "loss": 0.1641,
      "reward": 0.3364746905863285,
      "reward_std": 0.8183911889791489,
      "rewards/cosine_scaled_reward": -0.04009598679840565,
      "rewards/format_reward": 0.4166666828095913,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3493.7083740234375,
      "epoch": 0.052,
      "grad_norm": 0.20229995250701904,
      "kl": 0.0487060546875,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0375,
      "reward": -0.22211312502622604,
      "reward_std": 0.800605058670044,
      "rewards/cosine_scaled_reward": -0.17355656623840332,
      "rewards/format_reward": 0.1250000037252903,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3018.25,
      "epoch": 0.052571428571428575,
      "grad_norm": 0.2712525427341461,
      "kl": 0.04443359375,
      "learning_rate": 9.816912885430258e-07,
      "loss": 0.1815,
      "reward": 0.07752631604671478,
      "reward_std": 0.3367920182645321,
      "rewards/cosine_scaled_reward": -0.065403513610363,
      "rewards/format_reward": 0.2083333432674408,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3192.416748046875,
      "epoch": 0.053142857142857144,
      "grad_norm": 0.2336161732673645,
      "kl": 0.05108642578125,
      "learning_rate": 9.807937738894303e-07,
      "loss": 0.1053,
      "reward": -0.15976980328559875,
      "reward_std": 0.66871527582407,
      "rewards/cosine_scaled_reward": -0.20488492399454117,
      "rewards/format_reward": 0.25,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3344.7916870117188,
      "epoch": 0.053714285714285714,
      "grad_norm": 0.19864843785762787,
      "kl": 0.0494384765625,
      "learning_rate": 9.798752629550546e-07,
      "loss": 0.1064,
      "reward": -0.45881245099008083,
      "reward_std": 0.6017686780542135,
      "rewards/cosine_scaled_reward": -0.3127395585179329,
      "rewards/format_reward": 0.1666666716337204,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3160.8751220703125,
      "epoch": 0.054285714285714284,
      "grad_norm": 0.7792258262634277,
      "kl": 0.142333984375,
      "learning_rate": 9.78935800506826e-07,
      "loss": 0.1646,
      "reward": -0.11449402663856745,
      "reward_std": 0.608274769037962,
      "rewards/cosine_scaled_reward": -0.18224701657891273,
      "rewards/format_reward": 0.2500000037252903,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3546.5,
      "epoch": 0.054857142857142854,
      "grad_norm": 0.15461793541908264,
      "kl": 0.045196533203125,
      "learning_rate": 9.779754323328192e-07,
      "loss": 0.0222,
      "reward": -0.1999459322541952,
      "reward_std": 0.6907303184270859,
      "rewards/cosine_scaled_reward": -0.12080629542469978,
      "rewards/format_reward": 0.0416666679084301,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3418.7083740234375,
      "epoch": 0.05542857142857143,
      "grad_norm": 0.17061863839626312,
      "kl": 0.05731201171875,
      "learning_rate": 9.769942052400235e-07,
      "loss": 0.0459,
      "reward": -0.578557875007391,
      "reward_std": 0.35796352103352547,
      "rewards/cosine_scaled_reward": -0.35177892446517944,
      "rewards/format_reward": 0.125,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2118.3333435058594,
      "epoch": 0.056,
      "grad_norm": 0.28902173042297363,
      "kl": 0.03961181640625,
      "learning_rate": 9.759921670520634e-07,
      "loss": -0.1185,
      "reward": 0.6223690360784531,
      "reward_std": 0.46096891909837723,
      "rewards/cosine_scaled_reward": 0.08201783150434494,
      "rewards/format_reward": 0.4583333432674408,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2805.000030517578,
      "epoch": 0.05657142857142857,
      "grad_norm": 0.21270987391471863,
      "kl": 0.0479736328125,
      "learning_rate": 9.749693666068663e-07,
      "loss": -0.0392,
      "reward": 0.10187321389093995,
      "reward_std": 0.4792479854077101,
      "rewards/cosine_scaled_reward": -0.09489673189818859,
      "rewards/format_reward": 0.2916666679084301,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2614.625045776367,
      "epoch": 0.05714285714285714,
      "grad_norm": 0.3168502449989319,
      "kl": 0.05035400390625,
      "learning_rate": 9.739258537542835e-07,
      "loss": 0.0458,
      "reward": 0.4905807599425316,
      "reward_std": 0.6621165350079536,
      "rewards/cosine_scaled_reward": 0.0161236971616745,
      "rewards/format_reward": 0.4583333432674408,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3412.8750610351562,
      "epoch": 0.05771428571428571,
      "grad_norm": 0.17035293579101562,
      "kl": 0.0479736328125,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0864,
      "reward": -0.51438994333148,
      "reward_std": 0.4058373123407364,
      "rewards/cosine_scaled_reward": -0.2988616116344929,
      "rewards/format_reward": 0.0833333358168602,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.05828571428571429,
      "grad_norm": 0.1507677137851715,
      "kl": 0.0511474609375,
      "learning_rate": 9.717768952713511e-07,
      "loss": 0.0002,
      "reward": -0.6419448927044868,
      "reward_std": 0.14684983156621456,
      "rewards/cosine_scaled_reward": -0.3209724463522434,
      "rewards/format_reward": 0.0,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.05885714285714286,
      "grad_norm": 0.17621943354606628,
      "kl": 0.0430908203125,
      "learning_rate": 9.706715543782064e-07,
      "loss": 0.0002,
      "reward": -0.704314574599266,
      "reward_std": 0.20182611048221588,
      "rewards/cosine_scaled_reward": -0.3521573022007942,
      "rewards/format_reward": 0.0,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.05942857142857143,
      "grad_norm": 0.15782414376735687,
      "kl": 0.04888916015625,
      "learning_rate": 9.695457105469804e-07,
      "loss": 0.0002,
      "reward": -0.5785035863518715,
      "reward_std": 0.25241581723093987,
      "rewards/cosine_scaled_reward": -0.28925178572535515,
      "rewards/format_reward": 0.0,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3385.5416870117188,
      "epoch": 0.06,
      "grad_norm": 0.19064433872699738,
      "kl": 0.05072021484375,
      "learning_rate": 9.683994186497132e-07,
      "loss": 0.0669,
      "reward": -0.5965285524725914,
      "reward_std": 0.3510228842496872,
      "rewards/cosine_scaled_reward": -0.360764279961586,
      "rewards/format_reward": 0.125,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2878.3333435058594,
      "epoch": 0.060571428571428575,
      "grad_norm": 0.28683388233184814,
      "kl": 0.0538330078125,
      "learning_rate": 9.672327345550543e-07,
      "loss": -0.0562,
      "reward": 0.14185508340597153,
      "reward_std": 0.477683924138546,
      "rewards/cosine_scaled_reward": -0.05407246574759483,
      "rewards/format_reward": 0.25,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3124.2916870117188,
      "epoch": 0.061142857142857145,
      "grad_norm": 0.20470094680786133,
      "kl": 0.05194091796875,
      "learning_rate": 9.66045715125541e-07,
      "loss": 0.0054,
      "reward": -0.014784537255764008,
      "reward_std": 0.49005767330527306,
      "rewards/cosine_scaled_reward": -0.1115589402616024,
      "rewards/format_reward": 0.2083333432674408,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2828.9583435058594,
      "epoch": 0.061714285714285715,
      "grad_norm": 0.2078002393245697,
      "kl": 0.05157470703125,
      "learning_rate": 9.648384182148252e-07,
      "loss": 0.13,
      "reward": -0.3206620067358017,
      "reward_std": 0.299712959676981,
      "rewards/cosine_scaled_reward": -0.30616434663534164,
      "rewards/format_reward": 0.2916666679084301,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.062285714285714285,
      "grad_norm": 0.16639988124370575,
      "kl": 0.052459716796875,
      "learning_rate": 9.636109026648554e-07,
      "loss": 0.0002,
      "reward": -0.6066285073757172,
      "reward_std": 0.14198161102831364,
      "rewards/cosine_scaled_reward": -0.30331425555050373,
      "rewards/format_reward": 0.0,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3380.2916870117188,
      "epoch": 0.06285714285714286,
      "grad_norm": 0.14965060353279114,
      "kl": 0.04302978515625,
      "learning_rate": 9.623632283030077e-07,
      "loss": 0.0395,
      "reward": -0.39130744338035583,
      "reward_std": 0.36896876990795135,
      "rewards/cosine_scaled_reward": -0.2998203821480274,
      "rewards/format_reward": 0.2083333432674408,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2828.125,
      "epoch": 0.06342857142857143,
      "grad_norm": 0.24873219430446625,
      "kl": 0.0533447265625,
      "learning_rate": 9.610954559391704e-07,
      "loss": -0.0065,
      "reward": 0.19809278845787048,
      "reward_std": 0.43846164271235466,
      "rewards/cosine_scaled_reward": -0.025953616946935654,
      "rewards/format_reward": 0.25,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3578.875,
      "epoch": 0.064,
      "grad_norm": 0.14656062424182892,
      "kl": 0.0477294921875,
      "learning_rate": 9.598076473627796e-07,
      "loss": 0.003,
      "reward": -0.35288260877132416,
      "reward_std": 0.4478282080963254,
      "rewards/cosine_scaled_reward": -0.19727462995797396,
      "rewards/format_reward": 0.0416666679084301,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3449.7083740234375,
      "epoch": 0.06457142857142857,
      "grad_norm": 0.1971622109413147,
      "kl": 0.0526123046875,
      "learning_rate": 9.58499865339809e-07,
      "loss": 0.0869,
      "reward": -0.35047246143221855,
      "reward_std": 0.272010013461113,
      "rewards/cosine_scaled_reward": -0.19606954976916313,
      "rewards/format_reward": 0.0416666679084301,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3555.75,
      "epoch": 0.06514285714285714,
      "grad_norm": 0.14575761556625366,
      "kl": 0.0478515625,
      "learning_rate": 9.571721736097088e-07,
      "loss": 0.0159,
      "reward": -0.647302895784378,
      "reward_std": 0.2911082152277231,
      "rewards/cosine_scaled_reward": -0.34448477625846863,
      "rewards/format_reward": 0.0416666679084301,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3405.875,
      "epoch": 0.06571428571428571,
      "grad_norm": 0.19001184403896332,
      "kl": 0.05657958984375,
      "learning_rate": 9.55824636882301e-07,
      "loss": 0.0777,
      "reward": -0.39468052983283997,
      "reward_std": 0.31587182730436325,
      "rewards/cosine_scaled_reward": -0.23900692909955978,
      "rewards/format_reward": 0.0833333358168602,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.06628571428571428,
      "grad_norm": 0.14631710946559906,
      "kl": 0.04296875,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.0002,
      "reward": -0.8244208693504333,
      "reward_std": 0.12861562799662352,
      "rewards/cosine_scaled_reward": -0.41221044957637787,
      "rewards/format_reward": 0.0,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3315.2083740234375,
      "epoch": 0.06685714285714285,
      "grad_norm": 0.18092887103557587,
      "kl": 0.041412353515625,
      "learning_rate": 9.530702921077358e-07,
      "loss": 0.0548,
      "reward": 0.3274298645555973,
      "reward_std": 0.6206382885575294,
      "rewards/cosine_scaled_reward": 0.03871491365134716,
      "rewards/format_reward": 0.2500000111758709,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3117.3333435058594,
      "epoch": 0.06742857142857143,
      "grad_norm": 0.2688407003879547,
      "kl": 0.0552978515625,
      "learning_rate": 9.516636183034564e-07,
      "loss": 0.1092,
      "reward": -0.10805931687355042,
      "reward_std": 0.5824379585683346,
      "rewards/cosine_scaled_reward": -0.1790296584367752,
      "rewards/format_reward": 0.2500000111758709,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2461.9166870117188,
      "epoch": 0.068,
      "grad_norm": 0.3144585192203522,
      "kl": 0.07818603515625,
      "learning_rate": 9.502373679810839e-07,
      "loss": 0.0948,
      "reward": 0.7282524108886719,
      "reward_std": 0.7472279723733664,
      "rewards/cosine_scaled_reward": 0.13495950400829315,
      "rewards/format_reward": 0.4583333395421505,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.06857142857142857,
      "grad_norm": 0.15482838451862335,
      "kl": 0.05096435546875,
      "learning_rate": 9.487916106540465e-07,
      "loss": 0.0002,
      "reward": -0.7398529201745987,
      "reward_std": 0.2509063072502613,
      "rewards/cosine_scaled_reward": -0.36992645263671875,
      "rewards/format_reward": 0.0,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3361.0833740234375,
      "epoch": 0.06914285714285714,
      "grad_norm": 0.21992942690849304,
      "kl": 0.0552978515625,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.1049,
      "reward": -0.4808087758719921,
      "reward_std": 0.3338266760110855,
      "rewards/cosine_scaled_reward": -0.2820710465312004,
      "rewards/format_reward": 0.0833333358168602,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2814.4583740234375,
      "epoch": 0.06971428571428571,
      "grad_norm": 0.23910638689994812,
      "kl": 0.0614013671875,
      "learning_rate": 9.458418577899774e-07,
      "loss": 0.0569,
      "reward": 0.9797220379114151,
      "reward_std": 1.128523275256157,
      "rewards/cosine_scaled_reward": 0.17736097052693367,
      "rewards/format_reward": 0.6250000149011612,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3343.7083740234375,
      "epoch": 0.07028571428571428,
      "grad_norm": 0.17705029249191284,
      "kl": 0.0452880859375,
      "learning_rate": 9.443380060197385e-07,
      "loss": 0.0985,
      "reward": -0.2491093035787344,
      "reward_std": 0.7347416132688522,
      "rewards/cosine_scaled_reward": -0.24955465272068977,
      "rewards/format_reward": 0.2500000074505806,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3565.7083740234375,
      "epoch": 0.07085714285714285,
      "grad_norm": 0.14711324870586395,
      "kl": 0.05926513671875,
      "learning_rate": 9.428149347714143e-07,
      "loss": 0.0103,
      "reward": -0.35953105124644935,
      "reward_std": 0.38775753043591976,
      "rewards/cosine_scaled_reward": -0.20059886015951633,
      "rewards/format_reward": 0.0416666679084301,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3302.3750610351562,
      "epoch": 0.07142857142857142,
      "grad_norm": 0.19406969845294952,
      "kl": 0.05615234375,
      "learning_rate": 9.412727182773486e-07,
      "loss": 0.0624,
      "reward": -0.2552947551012039,
      "reward_std": 0.4256477430462837,
      "rewards/cosine_scaled_reward": -0.23181404545903206,
      "rewards/format_reward": 0.2083333358168602,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2113.375015258789,
      "epoch": 0.072,
      "grad_norm": 0.25354713201522827,
      "kl": 0.05657958984375,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.134,
      "reward": 0.26037219166755676,
      "reward_std": 0.45451565831899643,
      "rewards/cosine_scaled_reward": -0.09898056834936142,
      "rewards/format_reward": 0.4583333432674408,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3431.375,
      "epoch": 0.07257142857142856,
      "grad_norm": 0.1632765680551529,
      "kl": 0.05291748046875,
      "learning_rate": 9.381311511432658e-07,
      "loss": 0.0309,
      "reward": -0.036101870238780975,
      "reward_std": 0.7900894656777382,
      "rewards/cosine_scaled_reward": -0.10138426348567009,
      "rewards/format_reward": 0.1666666716337204,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3377.8333740234375,
      "epoch": 0.07314285714285715,
      "grad_norm": 0.1895456165075302,
      "kl": 0.048583984375,
      "learning_rate": 9.36531953618799e-07,
      "loss": 0.0875,
      "reward": 0.3225628361105919,
      "reward_std": 0.5909937657415867,
      "rewards/cosine_scaled_reward": 0.015448085963726044,
      "rewards/format_reward": 0.291666679084301,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3365.6666870117188,
      "epoch": 0.07371428571428572,
      "grad_norm": 0.19840599596500397,
      "kl": 0.04510498046875,
      "learning_rate": 9.34913917072228e-07,
      "loss": 0.0565,
      "reward": -0.14107680320739746,
      "reward_std": 0.6309686824679375,
      "rewards/cosine_scaled_reward": -0.21637173369526863,
      "rewards/format_reward": 0.2916666716337204,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3476.4583740234375,
      "epoch": 0.07428571428571429,
      "grad_norm": 0.1595059335231781,
      "kl": 0.051025390625,
      "learning_rate": 9.332771203643714e-07,
      "loss": -0.0039,
      "reward": -0.14000652357935905,
      "reward_std": 0.44728637486696243,
      "rewards/cosine_scaled_reward": -0.11166992946527898,
      "rewards/format_reward": 0.0833333358168602,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3205.7916870117188,
      "epoch": 0.07485714285714286,
      "grad_norm": 0.15057101845741272,
      "kl": 0.0469970703125,
      "learning_rate": 9.316216432703916e-07,
      "loss": -0.0239,
      "reward": -0.4409569948911667,
      "reward_std": 0.21845832839608192,
      "rewards/cosine_scaled_reward": -0.34547850489616394,
      "rewards/format_reward": 0.25,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3533.7083740234375,
      "epoch": 0.07542857142857143,
      "grad_norm": 0.14339689910411835,
      "kl": 0.04742431640625,
      "learning_rate": 9.299475664759068e-07,
      "loss": 0.0081,
      "reward": -0.17459139972925186,
      "reward_std": 0.366999352700077,
      "rewards/cosine_scaled_reward": -0.19146236404776573,
      "rewards/format_reward": 0.2083333432674408,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3345.7500610351562,
      "epoch": 0.076,
      "grad_norm": 0.28945690393447876,
      "kl": 0.068115234375,
      "learning_rate": 9.282549715730579e-07,
      "loss": 0.0761,
      "reward": -0.205301433801651,
      "reward_std": 0.7304530702531338,
      "rewards/cosine_scaled_reward": -0.185984056442976,
      "rewards/format_reward": 0.1666666716337204,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.07657142857142857,
      "grad_norm": 0.13178248703479767,
      "kl": 0.0406494140625,
      "learning_rate": 9.265439410565328e-07,
      "loss": 0.0002,
      "reward": -0.21339796762913465,
      "reward_std": 0.14427685737609863,
      "rewards/cosine_scaled_reward": -0.10669897636398673,
      "rewards/format_reward": 0.0,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2877.0833740234375,
      "epoch": 0.07714285714285714,
      "grad_norm": 0.6117133498191833,
      "kl": 0.062744140625,
      "learning_rate": 9.248145583195447e-07,
      "loss": -0.1135,
      "reward": -0.3104187399148941,
      "reward_std": 0.4863443411886692,
      "rewards/cosine_scaled_reward": -0.30104270949959755,
      "rewards/format_reward": 0.2916666679084301,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2646.166748046875,
      "epoch": 0.07771428571428571,
      "grad_norm": 0.2174384891986847,
      "kl": 0.078369140625,
      "learning_rate": 9.230669076497687e-07,
      "loss": 0.0099,
      "reward": 0.501250134781003,
      "reward_std": 0.6959330216050148,
      "rewards/cosine_scaled_reward": 0.0006250720471143723,
      "rewards/format_reward": 0.5000000149011612,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2857.291717529297,
      "epoch": 0.07828571428571429,
      "grad_norm": 0.19513949751853943,
      "kl": 0.03631591796875,
      "learning_rate": 9.213010742252327e-07,
      "loss": 0.0129,
      "reward": 0.05199408531188965,
      "reward_std": 0.5263700187206268,
      "rewards/cosine_scaled_reward": -0.16150296479463577,
      "rewards/format_reward": 0.375,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2722.4583435058594,
      "epoch": 0.07885714285714286,
      "grad_norm": 0.22437696158885956,
      "kl": 0.048919677734375,
      "learning_rate": 9.195171441101668e-07,
      "loss": 0.153,
      "reward": 0.05756654590368271,
      "reward_std": 0.4655684223398566,
      "rewards/cosine_scaled_reward": -0.17955005168914795,
      "rewards/format_reward": 0.4166666865348816,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3546.625,
      "epoch": 0.07942857142857143,
      "grad_norm": 0.14434434473514557,
      "kl": 0.04290771484375,
      "learning_rate": 9.177152042508077e-07,
      "loss": 0.016,
      "reward": -0.4627462103962898,
      "reward_std": 0.31377890706062317,
      "rewards/cosine_scaled_reward": -0.2730397693812847,
      "rewards/format_reward": 0.0833333358168602,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3403.2916870117188,
      "epoch": 0.08,
      "grad_norm": 0.2034330666065216,
      "kl": 0.041259765625,
      "learning_rate": 9.158953424711624e-07,
      "loss": 0.0195,
      "reward": 0.1140199825167656,
      "reward_std": 0.5117178149521351,
      "rewards/cosine_scaled_reward": -0.0471566803753376,
      "rewards/format_reward": 0.2083333432674408,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3293.041748046875,
      "epoch": 0.08057142857142857,
      "grad_norm": 0.1963217407464981,
      "kl": 0.06451416015625,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.0816,
      "reward": -0.41971880942583084,
      "reward_std": 0.46679312735795975,
      "rewards/cosine_scaled_reward": -0.3140260688960552,
      "rewards/format_reward": 0.2083333358168602,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3501.2916870117188,
      "epoch": 0.08114285714285714,
      "grad_norm": 0.15945497155189514,
      "kl": 0.06207275390625,
      "learning_rate": 9.122022088101613e-07,
      "loss": 0.0237,
      "reward": 0.19435557164251804,
      "reward_std": 0.5239780992269516,
      "rewards/cosine_scaled_reward": 0.034677786752581596,
      "rewards/format_reward": 0.1250000037252903,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2787.458335876465,
      "epoch": 0.08171428571428571,
      "grad_norm": 0.2447752207517624,
      "kl": 0.0467529296875,
      "learning_rate": 9.103291169269299e-07,
      "loss": 0.0355,
      "reward": 0.38285309448838234,
      "reward_std": 0.41509686410427094,
      "rewards/cosine_scaled_reward": 0.045593203976750374,
      "rewards/format_reward": 0.2916666679084301,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3170.9584350585938,
      "epoch": 0.08228571428571428,
      "grad_norm": 38.13862228393555,
      "kl": 6.65838623046875,
      "learning_rate": 9.084384631108882e-07,
      "loss": 0.1426,
      "reward": 0.41588541213423014,
      "reward_std": 1.0277538150548935,
      "rewards/cosine_scaled_reward": -0.04205727390944958,
      "rewards/format_reward": 0.5000000074505806,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3023.7083435058594,
      "epoch": 0.08285714285714285,
      "grad_norm": 0.24795998632907867,
      "kl": 0.23541259765625,
      "learning_rate": 9.065303395098358e-07,
      "loss": 0.1092,
      "reward": 0.08817495405673981,
      "reward_std": 0.6404087841510773,
      "rewards/cosine_scaled_reward": -0.0809125080704689,
      "rewards/format_reward": 0.2500000111758709,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2834.2916870117188,
      "epoch": 0.08342857142857144,
      "grad_norm": 0.1742977350950241,
      "kl": 0.05084228515625,
      "learning_rate": 9.046048391230247e-07,
      "loss": 0.0127,
      "reward": -0.02124733477830887,
      "reward_std": 0.6222574003040791,
      "rewards/cosine_scaled_reward": -0.17729033529758453,
      "rewards/format_reward": 0.3333333358168602,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3126.5,
      "epoch": 0.084,
      "grad_norm": 0.2359555959701538,
      "kl": 0.0499267578125,
      "learning_rate": 9.026620557966279e-07,
      "loss": 0.0858,
      "reward": 0.02666241116821766,
      "reward_std": 0.4308905638754368,
      "rewards/cosine_scaled_reward": -0.09083548840135336,
      "rewards/format_reward": 0.2083333432674408,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3065.1666870117188,
      "epoch": 0.08457142857142858,
      "grad_norm": 0.16794665157794952,
      "kl": 0.05181884765625,
      "learning_rate": 9.007020842191634e-07,
      "loss": -0.0189,
      "reward": -0.25704628229141235,
      "reward_std": 0.5457647405564785,
      "rewards/cosine_scaled_reward": -0.2951898043975234,
      "rewards/format_reward": 0.3333333358168602,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3517.9583740234375,
      "epoch": 0.08514285714285715,
      "grad_norm": 0.1912597119808197,
      "kl": 0.078857421875,
      "learning_rate": 8.987250199168808e-07,
      "loss": 0.0277,
      "reward": -0.3515687808394432,
      "reward_std": 0.3064217194914818,
      "rewards/cosine_scaled_reward": -0.2382843866944313,
      "rewards/format_reward": 0.1250000037252903,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3309.8333740234375,
      "epoch": 0.08571428571428572,
      "grad_norm": 0.17105424404144287,
      "kl": 0.0648193359375,
      "learning_rate": 8.967309592491052e-07,
      "loss": 0.0291,
      "reward": -0.26190581917762756,
      "reward_std": 0.4774948377162218,
      "rewards/cosine_scaled_reward": -0.19345290772616863,
      "rewards/format_reward": 0.125,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3336.7500610351562,
      "epoch": 0.08628571428571429,
      "grad_norm": 0.15193696320056915,
      "kl": 0.0372314453125,
      "learning_rate": 8.9471999940354e-07,
      "loss": 0.0409,
      "reward": 0.2141360342502594,
      "reward_std": 0.7047755531966686,
      "rewards/cosine_scaled_reward": -0.05959864519536495,
      "rewards/format_reward": 0.3333333432674408,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3453.0833740234375,
      "epoch": 0.08685714285714285,
      "grad_norm": 0.17530380189418793,
      "kl": 0.0565185546875,
      "learning_rate": 8.926922383915315e-07,
      "loss": 0.0401,
      "reward": -0.20536936819553375,
      "reward_std": 0.6435524728149176,
      "rewards/cosine_scaled_reward": -0.16518468409776688,
      "rewards/format_reward": 0.125,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3076.2083435058594,
      "epoch": 0.08742857142857142,
      "grad_norm": 0.2100270837545395,
      "kl": 0.05108642578125,
      "learning_rate": 8.906477750432903e-07,
      "loss": 0.1391,
      "reward": -0.07981336116790771,
      "reward_std": 0.5872795805335045,
      "rewards/cosine_scaled_reward": -0.14407330751419067,
      "rewards/format_reward": 0.2083333432674408,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3504.4166870117188,
      "epoch": 0.088,
      "grad_norm": 0.1730157434940338,
      "kl": 0.05682373046875,
      "learning_rate": 8.88586709003076e-07,
      "loss": 0.026,
      "reward": -0.5636200718581676,
      "reward_std": 0.1353142261505127,
      "rewards/cosine_scaled_reward": -0.30264334939420223,
      "rewards/format_reward": 0.0416666679084301,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3428.7916870117188,
      "epoch": 0.08857142857142856,
      "grad_norm": 0.1775522381067276,
      "kl": 0.050537109375,
      "learning_rate": 8.865091407243394e-07,
      "loss": 0.0571,
      "reward": -0.22037950158119202,
      "reward_std": 0.46343767642974854,
      "rewards/cosine_scaled_reward": -0.1518564149737358,
      "rewards/format_reward": 0.0833333358168602,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3031.916717529297,
      "epoch": 0.08914285714285715,
      "grad_norm": 0.2878535985946655,
      "kl": 0.06085205078125,
      "learning_rate": 8.844151714648274e-07,
      "loss": 0.1139,
      "reward": 0.004119843244552612,
      "reward_std": 0.5550502277910709,
      "rewards/cosine_scaled_reward": -0.1437734253704548,
      "rewards/format_reward": 0.291666679084301,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3031.9166870117188,
      "epoch": 0.08971428571428572,
      "grad_norm": 0.1691678762435913,
      "kl": 0.04876708984375,
      "learning_rate": 8.823049032816478e-07,
      "loss": 0.0713,
      "reward": -0.09758711606264114,
      "reward_std": 0.5461144000291824,
      "rewards/cosine_scaled_reward": -0.21546022966504097,
      "rewards/format_reward": 0.3333333358168602,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3353.8333740234375,
      "epoch": 0.09028571428571429,
      "grad_norm": 0.1852702796459198,
      "kl": 0.04815673828125,
      "learning_rate": 8.801784390262943e-07,
      "loss": 0.0788,
      "reward": -0.16877157613635063,
      "reward_std": 0.5984261110424995,
      "rewards/cosine_scaled_reward": -0.20938578806817532,
      "rewards/format_reward": 0.2500000111758709,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3472.5833740234375,
      "epoch": 0.09085714285714286,
      "grad_norm": 0.2011410892009735,
      "kl": 0.05548095703125,
      "learning_rate": 8.780358823396352e-07,
      "loss": 0.0726,
      "reward": -0.5162428542971611,
      "reward_std": 0.5920832827687263,
      "rewards/cosine_scaled_reward": -0.29978808760643005,
      "rewards/format_reward": 0.0833333358168602,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.09142857142857143,
      "grad_norm": 0.15513016283512115,
      "kl": 0.0465087890625,
      "learning_rate": 8.758773376468604e-07,
      "loss": 0.0002,
      "reward": -0.45894186943769455,
      "reward_std": 0.1620104108005762,
      "rewards/cosine_scaled_reward": -0.22947093471884727,
      "rewards/format_reward": 0.0,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.092,
      "grad_norm": 0.1544281244277954,
      "kl": 0.05731201171875,
      "learning_rate": 8.737029101523929e-07,
      "loss": 0.0002,
      "reward": -0.5140766091644764,
      "reward_std": 0.2021910808980465,
      "rewards/cosine_scaled_reward": -0.25703830271959305,
      "rewards/format_reward": 0.0,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3211.875,
      "epoch": 0.09257142857142857,
      "grad_norm": 0.18425820767879486,
      "kl": 0.04522705078125,
      "learning_rate": 8.715127058347614e-07,
      "loss": 0.0104,
      "reward": 0.019165851175785065,
      "reward_std": 0.553180105984211,
      "rewards/cosine_scaled_reward": -0.09458375349640846,
      "rewards/format_reward": 0.2083333432674408,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3459.7083740234375,
      "epoch": 0.09314285714285714,
      "grad_norm": 0.19152286648750305,
      "kl": 0.068115234375,
      "learning_rate": 8.693068314414344e-07,
      "loss": 0.0666,
      "reward": -0.32729392871260643,
      "reward_std": 0.5201658196747303,
      "rewards/cosine_scaled_reward": -0.18448030017316341,
      "rewards/format_reward": 0.0416666679084301,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3519.9583740234375,
      "epoch": 0.09371428571428571,
      "grad_norm": 0.15800310671329498,
      "kl": 0.04876708984375,
      "learning_rate": 8.670853944836176e-07,
      "loss": 0.0335,
      "reward": -0.4632922485470772,
      "reward_std": 0.4524738918989897,
      "rewards/cosine_scaled_reward": -0.27331279031932354,
      "rewards/format_reward": 0.0833333358168602,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.09428571428571429,
      "grad_norm": 0.1423046737909317,
      "kl": 0.04443359375,
      "learning_rate": 8.648485032310144e-07,
      "loss": 0.0002,
      "reward": -0.6841404587030411,
      "reward_std": 0.12779409438371658,
      "rewards/cosine_scaled_reward": -0.34207022935152054,
      "rewards/format_reward": 0.0,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.09485714285714286,
      "grad_norm": 0.15924735367298126,
      "kl": 0.0531005859375,
      "learning_rate": 8.625962667065487e-07,
      "loss": 0.0002,
      "reward": -0.8217453360557556,
      "reward_std": 0.14010480791330338,
      "rewards/cosine_scaled_reward": -0.4108726605772972,
      "rewards/format_reward": 0.0,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3299.3750610351562,
      "epoch": 0.09542857142857143,
      "grad_norm": 0.1801840364933014,
      "kl": 0.04791259765625,
      "learning_rate": 8.603287946810513e-07,
      "loss": 0.0996,
      "reward": 0.3397903465665877,
      "reward_std": 1.7125960290431976,
      "rewards/cosine_scaled_reward": 0.003228497225791216,
      "rewards/format_reward": 0.3333333395421505,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3347.5416870117188,
      "epoch": 0.096,
      "grad_norm": 0.1715705245733261,
      "kl": 0.05438232421875,
      "learning_rate": 8.580461976679099e-07,
      "loss": 0.0074,
      "reward": -0.49858966283500195,
      "reward_std": 0.5019380133599043,
      "rewards/cosine_scaled_reward": -0.33262816444039345,
      "rewards/format_reward": 0.1666666679084301,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3189.25,
      "epoch": 0.09657142857142857,
      "grad_norm": 0.2528628408908844,
      "kl": 0.04998779296875,
      "learning_rate": 8.557485869176825e-07,
      "loss": 0.0895,
      "reward": -0.13017432391643524,
      "reward_std": 0.38139653019607067,
      "rewards/cosine_scaled_reward": -0.16925383359193802,
      "rewards/format_reward": 0.2083333432674408,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.09714285714285714,
      "grad_norm": 0.14756138622760773,
      "kl": 0.0531005859375,
      "learning_rate": 8.534360744126753e-07,
      "loss": 0.0002,
      "reward": -0.36126868799328804,
      "reward_std": 0.25860733538866043,
      "rewards/cosine_scaled_reward": -0.18063434585928917,
      "rewards/format_reward": 0.0,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3454.625,
      "epoch": 0.09771428571428571,
      "grad_norm": 0.163909912109375,
      "kl": 0.050048828125,
      "learning_rate": 8.511087728614862e-07,
      "loss": -0.0115,
      "reward": -0.11545135825872421,
      "reward_std": 0.5508421286940575,
      "rewards/cosine_scaled_reward": -0.0785590149462223,
      "rewards/format_reward": 0.0416666679084301,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3559.1666870117188,
      "epoch": 0.09828571428571428,
      "grad_norm": 0.17620068788528442,
      "kl": 0.056884765625,
      "learning_rate": 8.487667956935087e-07,
      "loss": 0.01,
      "reward": -0.5306723043322563,
      "reward_std": 0.20070407167077065,
      "rewards/cosine_scaled_reward": -0.30700283497571945,
      "rewards/format_reward": 0.0833333358168602,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3125.8750610351562,
      "epoch": 0.09885714285714285,
      "grad_norm": 0.16078141331672668,
      "kl": 0.06512451171875,
      "learning_rate": 8.464102570534061e-07,
      "loss": -0.0308,
      "reward": -0.16053162794560194,
      "reward_std": 0.5758852250874043,
      "rewards/cosine_scaled_reward": -0.2469324842095375,
      "rewards/format_reward": 0.3333333358168602,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.09942857142857142,
      "grad_norm": 0.15852725505828857,
      "kl": 0.06536865234375,
      "learning_rate": 8.440392717955475e-07,
      "loss": 0.0003,
      "reward": -0.5079451501369476,
      "reward_std": 0.22260741889476776,
      "rewards/cosine_scaled_reward": -0.2539725750684738,
      "rewards/format_reward": 0.0,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3431.7500610351562,
      "epoch": 0.1,
      "grad_norm": 0.15926480293273926,
      "kl": 0.044921875,
      "learning_rate": 8.416539554784089e-07,
      "loss": 0.0673,
      "reward": -0.14099129289388657,
      "reward_std": 0.7832577079534531,
      "rewards/cosine_scaled_reward": -0.1746623208746314,
      "rewards/format_reward": 0.2083333358168602,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3245.3750610351562,
      "epoch": 0.10057142857142858,
      "grad_norm": 0.17859475314617157,
      "kl": 0.068115234375,
      "learning_rate": 8.392544243589427e-07,
      "loss": 0.0869,
      "reward": -0.0380060151219368,
      "reward_std": 0.6192945204675198,
      "rewards/cosine_scaled_reward": -0.1440030261874199,
      "rewards/format_reward": 0.2500000074505806,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3133.875,
      "epoch": 0.10114285714285715,
      "grad_norm": 0.15891003608703613,
      "kl": 0.04754638671875,
      "learning_rate": 8.368407953869103e-07,
      "loss": 0.0181,
      "reward": 0.08751339092850685,
      "reward_std": 0.16736168786883354,
      "rewards/cosine_scaled_reward": -0.08124331943690777,
      "rewards/format_reward": 0.25,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3558.6666870117188,
      "epoch": 0.10171428571428572,
      "grad_norm": 0.14599938690662384,
      "kl": 0.04962158203125,
      "learning_rate": 8.344131861991828e-07,
      "loss": 0.0082,
      "reward": -0.16755510121583939,
      "reward_std": 0.7793578952550888,
      "rewards/cosine_scaled_reward": -0.1462775506079197,
      "rewards/format_reward": 0.125,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3514.75,
      "epoch": 0.10228571428571429,
      "grad_norm": 0.2012149542570114,
      "kl": 0.055419921875,
      "learning_rate": 8.319717151140072e-07,
      "loss": 0.027,
      "reward": -0.6090946160256863,
      "reward_std": 0.3212553486227989,
      "rewards/cosine_scaled_reward": -0.3462139815092087,
      "rewards/format_reward": 0.0833333358168602,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.10285714285714286,
      "grad_norm": 0.15520262718200684,
      "kl": 0.05035400390625,
      "learning_rate": 8.295165011252396e-07,
      "loss": 0.0002,
      "reward": -0.44433633610606194,
      "reward_std": 0.4537891875952482,
      "rewards/cosine_scaled_reward": -0.2638348173350096,
      "rewards/format_reward": 0.0833333358168602,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.10342857142857143,
      "grad_norm": 0.1958397924900055,
      "kl": 0.04217529296875,
      "learning_rate": 8.270476638965461e-07,
      "loss": 0.0002,
      "reward": -0.19999001920223236,
      "reward_std": 0.5615897215902805,
      "rewards/cosine_scaled_reward": -0.20416167378425598,
      "rewards/format_reward": 0.2083333395421505,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.104,
      "grad_norm": 0.15346062183380127,
      "kl": 0.05303955078125,
      "learning_rate": 8.245653237555705e-07,
      "loss": 0.0002,
      "reward": -0.7306937873363495,
      "reward_std": 0.16790879145264626,
      "rewards/cosine_scaled_reward": -0.36534689366817474,
      "rewards/format_reward": 0.0,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3526.5000610351562,
      "epoch": 0.10457142857142857,
      "grad_norm": 0.16117896139621735,
      "kl": 0.052734375,
      "learning_rate": 8.220696016880687e-07,
      "loss": 0.0222,
      "reward": -0.4183296374976635,
      "reward_std": 0.4902483597397804,
      "rewards/cosine_scaled_reward": -0.2924981564283371,
      "rewards/format_reward": 0.1666666679084301,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3427.625,
      "epoch": 0.10514285714285715,
      "grad_norm": 0.1658228635787964,
      "kl": 0.0543212890625,
      "learning_rate": 8.195606193320136e-07,
      "loss": 0.0467,
      "reward": -0.23194494098424911,
      "reward_std": 0.825510136783123,
      "rewards/cosine_scaled_reward": -0.15763913467526436,
      "rewards/format_reward": 0.0833333358168602,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2791.125,
      "epoch": 0.10571428571428572,
      "grad_norm": 0.42574554681777954,
      "kl": 0.0634765625,
      "learning_rate": 8.170384989716657e-07,
      "loss": 0.0135,
      "reward": -0.48800092935562134,
      "reward_std": 0.21989084407687187,
      "rewards/cosine_scaled_reward": -0.3481671344488859,
      "rewards/format_reward": 0.2083333432674408,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2866.3333740234375,
      "epoch": 0.10628571428571429,
      "grad_norm": 0.20383749902248383,
      "kl": 0.05389404296875,
      "learning_rate": 8.145033635316128e-07,
      "loss": 0.0027,
      "reward": 0.5907801762223244,
      "reward_std": 0.6398285947507247,
      "rewards/cosine_scaled_reward": 0.08705675601959229,
      "rewards/format_reward": 0.4166666679084301,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.10685714285714286,
      "grad_norm": 0.1624538004398346,
      "kl": 0.05572509765625,
      "learning_rate": 8.119553365707802e-07,
      "loss": 0.0002,
      "reward": -0.5849527418613434,
      "reward_std": 0.18283319287002087,
      "rewards/cosine_scaled_reward": -0.2924763709306717,
      "rewards/format_reward": 0.0,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.10742857142857143,
      "grad_norm": 0.14820487797260284,
      "kl": 0.05047607421875,
      "learning_rate": 8.093945422764069e-07,
      "loss": 0.0002,
      "reward": -0.5387367829680443,
      "reward_std": 0.39942592941224575,
      "rewards/cosine_scaled_reward": -0.2902017207816243,
      "rewards/format_reward": 0.0416666679084301,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3558.5833740234375,
      "epoch": 0.108,
      "grad_norm": 0.1541958600282669,
      "kl": 0.04547119140625,
      "learning_rate": 8.068211054579943e-07,
      "loss": 0.0146,
      "reward": -0.5106032621115446,
      "reward_std": 0.5608320534229279,
      "rewards/cosine_scaled_reward": -0.2969682924449444,
      "rewards/format_reward": 0.0833333358168602,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3313.5833740234375,
      "epoch": 0.10857142857142857,
      "grad_norm": 0.2825907766819,
      "kl": 0.0518798828125,
      "learning_rate": 8.04235151541222e-07,
      "loss": 0.0952,
      "reward": -0.5725184977054596,
      "reward_std": 0.32253118604421616,
      "rewards/cosine_scaled_reward": -0.3487592488527298,
      "rewards/format_reward": 0.125,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2769.500045776367,
      "epoch": 0.10914285714285714,
      "grad_norm": 0.751027524471283,
      "kl": 0.06817626953125,
      "learning_rate": 8.01636806561836e-07,
      "loss": -0.0603,
      "reward": -0.14048952236771584,
      "reward_std": 0.46144552156329155,
      "rewards/cosine_scaled_reward": -0.23691142722964287,
      "rewards/format_reward": 0.3333333432674408,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3254.0833740234375,
      "epoch": 0.10971428571428571,
      "grad_norm": 0.17224395275115967,
      "kl": 0.06939697265625,
      "learning_rate": 7.990261971595048e-07,
      "loss": -0.0575,
      "reward": 0.1011834591627121,
      "reward_std": 0.3935512360185385,
      "rewards/cosine_scaled_reward": -0.07440828531980515,
      "rewards/format_reward": 0.25,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.11028571428571429,
      "grad_norm": 0.13426987826824188,
      "kl": 0.04534912109375,
      "learning_rate": 7.964034505716476e-07,
      "loss": 0.0002,
      "reward": -0.6477106511592865,
      "reward_std": 0.0925671923905611,
      "rewards/cosine_scaled_reward": -0.32385531067848206,
      "rewards/format_reward": 0.0,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2811.875045776367,
      "epoch": 0.11085714285714286,
      "grad_norm": 0.2597994804382324,
      "kl": 0.08905029296875,
      "learning_rate": 7.93768694627233e-07,
      "loss": -0.0486,
      "reward": 0.23529191315174103,
      "reward_std": 0.8221290409564972,
      "rewards/cosine_scaled_reward": -0.04902072250843048,
      "rewards/format_reward": 0.3333333358168602,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3352.5833740234375,
      "epoch": 0.11142857142857143,
      "grad_norm": 0.23991723358631134,
      "kl": 0.060546875,
      "learning_rate": 7.911220577405484e-07,
      "loss": 0.0555,
      "reward": -0.0027063414454460144,
      "reward_std": 0.608617402613163,
      "rewards/cosine_scaled_reward": -0.1263531558215618,
      "rewards/format_reward": 0.2500000074505806,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2819.0416717529297,
      "epoch": 0.112,
      "grad_norm": 0.24572671949863434,
      "kl": 0.07049560546875,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.0432,
      "reward": -0.14430035650730133,
      "reward_std": 0.4976443909108639,
      "rewards/cosine_scaled_reward": -0.17631685733795166,
      "rewards/format_reward": 0.2083333432674408,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3383.0416870117188,
      "epoch": 0.11257142857142857,
      "grad_norm": 0.15762414038181305,
      "kl": 0.05340576171875,
      "learning_rate": 7.857936576865356e-07,
      "loss": 0.0622,
      "reward": -0.433723047375679,
      "reward_std": 0.5919782798737288,
      "rewards/cosine_scaled_reward": -0.2793615125119686,
      "rewards/format_reward": 0.125,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2997.4166870117188,
      "epoch": 0.11314285714285714,
      "grad_norm": 0.1949368119239807,
      "kl": 0.0482177734375,
      "learning_rate": 7.831121542179086e-07,
      "loss": 0.0038,
      "reward": 0.30401055328547955,
      "reward_std": 0.3298298269510269,
      "rewards/cosine_scaled_reward": 0.02700526174157858,
      "rewards/format_reward": 0.25,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3476.9583740234375,
      "epoch": 0.11371428571428571,
      "grad_norm": 0.16105806827545166,
      "kl": 0.04974365234375,
      "learning_rate": 7.804192891917571e-07,
      "loss": 0.0267,
      "reward": 0.08356641232967377,
      "reward_std": 0.5651987139135599,
      "rewards/cosine_scaled_reward": -0.062383463606238365,
      "rewards/format_reward": 0.2083333395421505,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3510.125,
      "epoch": 0.11428571428571428,
      "grad_norm": 0.1825430691242218,
      "kl": 0.06591796875,
      "learning_rate": 7.777151938545235e-07,
      "loss": 0.0202,
      "reward": -0.3780656084418297,
      "reward_std": 0.22097079828381538,
      "rewards/cosine_scaled_reward": -0.25153281539678574,
      "rewards/format_reward": 0.125,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3476.125,
      "epoch": 0.11485714285714285,
      "grad_norm": 0.15311142802238464,
      "kl": 0.06298828125,
      "learning_rate": 7.75e-07,
      "loss": 0.068,
      "reward": -0.3960627820342779,
      "reward_std": 0.26806606631726027,
      "rewards/cosine_scaled_reward": -0.21886471938341856,
      "rewards/format_reward": 0.0416666679084301,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3063.3333435058594,
      "epoch": 0.11542857142857142,
      "grad_norm": 0.2661282420158386,
      "kl": 0.050048828125,
      "learning_rate": 7.72273839962904e-07,
      "loss": 0.1064,
      "reward": -0.12841053307056427,
      "reward_std": 0.5312525816261768,
      "rewards/cosine_scaled_reward": -0.16837193816900253,
      "rewards/format_reward": 0.2083333432674408,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3433.9583740234375,
      "epoch": 0.116,
      "grad_norm": 0.20914843678474426,
      "kl": 0.049560546875,
      "learning_rate": 7.695368466124296e-07,
      "loss": 0.0853,
      "reward": -0.22245256043970585,
      "reward_std": 0.3599269762635231,
      "rewards/cosine_scaled_reward": -0.15289294999092817,
      "rewards/format_reward": 0.0833333358168602,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2902.2083435058594,
      "epoch": 0.11657142857142858,
      "grad_norm": 0.18525813519954681,
      "kl": 0.04925537109375,
      "learning_rate": 7.667891533457718e-07,
      "loss": -0.0045,
      "reward": -0.38538965582847595,
      "reward_std": 0.27233337238430977,
      "rewards/cosine_scaled_reward": -0.29686148278415203,
      "rewards/format_reward": 0.2083333432674408,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3465.875,
      "epoch": 0.11714285714285715,
      "grad_norm": 0.27074506878852844,
      "kl": 0.0501708984375,
      "learning_rate": 7.640308940816239e-07,
      "loss": 0.0666,
      "reward": -0.20100652147084475,
      "reward_std": 0.5179510489106178,
      "rewards/cosine_scaled_reward": -0.14216993749141693,
      "rewards/format_reward": 0.0833333358168602,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3581.5833740234375,
      "epoch": 0.11771428571428572,
      "grad_norm": 0.14444300532341003,
      "kl": 0.06109619140625,
      "learning_rate": 7.612622032536507e-07,
      "loss": 0.0009,
      "reward": -0.007464568130671978,
      "reward_std": 0.5384093690663576,
      "rewards/cosine_scaled_reward": -0.0662322910502553,
      "rewards/format_reward": 0.1250000037252903,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3480.5833740234375,
      "epoch": 0.11828571428571429,
      "grad_norm": 0.17702296376228333,
      "kl": 0.04974365234375,
      "learning_rate": 7.584832158039378e-07,
      "loss": 0.0625,
      "reward": -0.43253427371382713,
      "reward_std": 0.3365586632862687,
      "rewards/cosine_scaled_reward": -0.2579338103532791,
      "rewards/format_reward": 0.0833333358168602,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2894.500030517578,
      "epoch": 0.11885714285714286,
      "grad_norm": 0.17304745316505432,
      "kl": 0.06671142578125,
      "learning_rate": 7.556940671764124e-07,
      "loss": -0.0349,
      "reward": 0.5662415772676468,
      "reward_std": 0.7728973366320133,
      "rewards/cosine_scaled_reward": 0.0747874528169632,
      "rewards/format_reward": 0.4166666716337204,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2965.875,
      "epoch": 0.11942857142857143,
      "grad_norm": 0.18019767105579376,
      "kl": 0.04754638671875,
      "learning_rate": 7.528948933102438e-07,
      "loss": 0.0153,
      "reward": 0.2692076712846756,
      "reward_std": 0.359660305082798,
      "rewards/cosine_scaled_reward": -0.0320628322660923,
      "rewards/format_reward": 0.3333333358168602,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3572.5,
      "epoch": 0.12,
      "grad_norm": 0.17177176475524902,
      "kl": 0.06103515625,
      "learning_rate": 7.500858306332172e-07,
      "loss": 0.0069,
      "reward": -0.5149929281324148,
      "reward_std": 0.27825676556676626,
      "rewards/cosine_scaled_reward": -0.278329792432487,
      "rewards/format_reward": 0.0416666679084301,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3285.2083740234375,
      "epoch": 0.12057142857142857,
      "grad_norm": 0.1756087690591812,
      "kl": 0.05584716796875,
      "learning_rate": 7.472670160550848e-07,
      "loss": 0.0255,
      "reward": -0.08769790083169937,
      "reward_std": 0.4134560525417328,
      "rewards/cosine_scaled_reward": -0.12718229368329048,
      "rewards/format_reward": 0.1666666716337204,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.12114285714285715,
      "grad_norm": 0.15736572444438934,
      "kl": 0.06146240234375,
      "learning_rate": 7.444385869608921e-07,
      "loss": 0.0002,
      "reward": -0.09432668518275023,
      "reward_std": 0.6111114136874676,
      "rewards/cosine_scaled_reward": -0.06799666350707412,
      "rewards/format_reward": 0.0416666679084301,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3303.0833740234375,
      "epoch": 0.12171428571428572,
      "grad_norm": 0.13703764975070953,
      "kl": 0.04400634765625,
      "learning_rate": 7.416006812042827e-07,
      "loss": -0.0403,
      "reward": 0.23149769008159637,
      "reward_std": 0.8395795077085495,
      "rewards/cosine_scaled_reward": -0.009251154959201813,
      "rewards/format_reward": 0.2500000111758709,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2751.875,
      "epoch": 0.12228571428571429,
      "grad_norm": 0.22418977320194244,
      "kl": 0.05072021484375,
      "learning_rate": 7.387534371007797e-07,
      "loss": 0.0599,
      "reward": -0.3867769241333008,
      "reward_std": 0.32847015745937824,
      "rewards/cosine_scaled_reward": -0.3392217978835106,
      "rewards/format_reward": 0.291666679084301,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.12285714285714286,
      "grad_norm": 0.16271401941776276,
      "kl": 0.0491943359375,
      "learning_rate": 7.358969934210438e-07,
      "loss": 0.0002,
      "reward": -0.763099730014801,
      "reward_std": 0.22296234592795372,
      "rewards/cosine_scaled_reward": -0.3815498650074005,
      "rewards/format_reward": 0.0,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2543.2084350585938,
      "epoch": 0.12342857142857143,
      "grad_norm": 0.19669151306152344,
      "kl": 0.06500244140625,
      "learning_rate": 7.330314893841101e-07,
      "loss": 0.0565,
      "reward": 1.6257893741130829,
      "reward_std": 0.8823383823037148,
      "rewards/cosine_scaled_reward": 0.4378946740180254,
      "rewards/format_reward": 0.7500000074505806,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3438.5833740234375,
      "epoch": 0.124,
      "grad_norm": 0.19945161044597626,
      "kl": 0.044189453125,
      "learning_rate": 7.301570646506027e-07,
      "loss": 0.06,
      "reward": -0.2967698462307453,
      "reward_std": 0.30110811814665794,
      "rewards/cosine_scaled_reward": -0.19005159474909306,
      "rewards/format_reward": 0.0833333358168602,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3469.9584350585938,
      "epoch": 0.12457142857142857,
      "grad_norm": 0.24981477856636047,
      "kl": 0.056640625,
      "learning_rate": 7.27273859315928e-07,
      "loss": 0.0418,
      "reward": 0.14413084089756012,
      "reward_std": 0.7537662945687771,
      "rewards/cosine_scaled_reward": -0.05293455999344587,
      "rewards/format_reward": 0.2500000037252903,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2854.4583435058594,
      "epoch": 0.12514285714285714,
      "grad_norm": 0.1918228268623352,
      "kl": 0.0614013671875,
      "learning_rate": 7.243820139034464e-07,
      "loss": -0.027,
      "reward": 0.359823577105999,
      "reward_std": 0.3150374963879585,
      "rewards/cosine_scaled_reward": -0.0700882226228714,
      "rewards/format_reward": 0.5,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3304.8333740234375,
      "epoch": 0.12571428571428572,
      "grad_norm": 0.17970924079418182,
      "kl": 0.04547119140625,
      "learning_rate": 7.214816693576234e-07,
      "loss": 0.0824,
      "reward": 0.4905561991035938,
      "reward_std": 1.0811701826751232,
      "rewards/cosine_scaled_reward": 0.036944760009646416,
      "rewards/format_reward": 0.4166666716337204,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2944.4583740234375,
      "epoch": 0.12628571428571428,
      "grad_norm": 0.20005832612514496,
      "kl": 0.0582275390625,
      "learning_rate": 7.185729670371604e-07,
      "loss": 0.1705,
      "reward": 0.11963904649019241,
      "reward_std": 0.9095522128045559,
      "rewards/cosine_scaled_reward": -0.10684715583920479,
      "rewards/format_reward": 0.3333333395421505,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2707.9583587646484,
      "epoch": 0.12685714285714286,
      "grad_norm": 0.29521119594573975,
      "kl": 0.0628662109375,
      "learning_rate": 7.156560487081051e-07,
      "loss": -0.0572,
      "reward": 0.17322428710758686,
      "reward_std": 0.6296472698450089,
      "rewards/cosine_scaled_reward": -0.12172118946909904,
      "rewards/format_reward": 0.4166666716337204,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3465.9583740234375,
      "epoch": 0.12742857142857142,
      "grad_norm": 0.20125603675842285,
      "kl": 0.06634521484375,
      "learning_rate": 7.127310565369415e-07,
      "loss": 0.0676,
      "reward": -0.5744385868310928,
      "reward_std": 0.25765218771994114,
      "rewards/cosine_scaled_reward": -0.308052621781826,
      "rewards/format_reward": 0.0416666679084301,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3561.2916870117188,
      "epoch": 0.128,
      "grad_norm": 0.21267585456371307,
      "kl": 0.0467529296875,
      "learning_rate": 7.097981330836616e-07,
      "loss": 0.0093,
      "reward": -0.16986877843737602,
      "reward_std": 0.8435531742870808,
      "rewards/cosine_scaled_reward": -0.1474343854933977,
      "rewards/format_reward": 0.1250000037252903,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3497.125,
      "epoch": 0.12857142857142856,
      "grad_norm": 0.18112824857234955,
      "kl": 0.05999755859375,
      "learning_rate": 7.068574212948169e-07,
      "loss": 0.0182,
      "reward": -0.0508711040019989,
      "reward_std": 0.6743626110255718,
      "rewards/cosine_scaled_reward": -0.10876888036727905,
      "rewards/format_reward": 0.1666666716337204,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3313.5,
      "epoch": 0.12914285714285714,
      "grad_norm": 0.176024928689003,
      "kl": 0.05694580078125,
      "learning_rate": 7.039090644965509e-07,
      "loss": -0.0426,
      "reward": 0.4725576564669609,
      "reward_std": 0.18281785771250725,
      "rewards/cosine_scaled_reward": 0.11127886641770601,
      "rewards/format_reward": 0.25,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2694.4166717529297,
      "epoch": 0.12971428571428573,
      "grad_norm": 0.20780931413173676,
      "kl": 0.04412841796875,
      "learning_rate": 7.009532063876148e-07,
      "loss": 0.1619,
      "reward": 0.05987407639622688,
      "reward_std": 0.787646472454071,
      "rewards/cosine_scaled_reward": -0.1575629599392414,
      "rewards/format_reward": 0.3750000037252903,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2700.6666870117188,
      "epoch": 0.13028571428571428,
      "grad_norm": 0.20493757724761963,
      "kl": 0.0606689453125,
      "learning_rate": 6.979899910323624e-07,
      "loss": 0.0601,
      "reward": 0.9252843372523785,
      "reward_std": 0.8674749806523323,
      "rewards/cosine_scaled_reward": 0.21264216862618923,
      "rewards/format_reward": 0.5,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2944.9583740234375,
      "epoch": 0.13085714285714287,
      "grad_norm": 0.17230452597141266,
      "kl": 0.052490234375,
      "learning_rate": 6.950195628537299e-07,
      "loss": -0.0099,
      "reward": -0.023678046971326694,
      "reward_std": 0.5056402957998216,
      "rewards/cosine_scaled_reward": -0.19933902844786644,
      "rewards/format_reward": 0.3750000037252903,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3560.6666870117188,
      "epoch": 0.13142857142857142,
      "grad_norm": 0.16569606959819794,
      "kl": 0.048095703125,
      "learning_rate": 6.920420666261961e-07,
      "loss": 0.013,
      "reward": -0.08609153889119625,
      "reward_std": 0.5831933580338955,
      "rewards/cosine_scaled_reward": -0.10554578248411417,
      "rewards/format_reward": 0.1250000037252903,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3443.7916870117188,
      "epoch": 0.132,
      "grad_norm": 0.17575590312480927,
      "kl": 0.0565185546875,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.0443,
      "reward": 0.044117134995758533,
      "reward_std": 0.6273631011135876,
      "rewards/cosine_scaled_reward": -0.040441428776830435,
      "rewards/format_reward": 0.1250000037252903,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2883.8333435058594,
      "epoch": 0.13257142857142856,
      "grad_norm": 0.21763095259666443,
      "kl": 0.0523681640625,
      "learning_rate": 6.860664508377001e-07,
      "loss": 0.026,
      "reward": -0.18567287921905518,
      "reward_std": 0.4521710118278861,
      "rewards/cosine_scaled_reward": -0.217836432158947,
      "rewards/format_reward": 0.25,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.13314285714285715,
      "grad_norm": 0.16012197732925415,
      "kl": 0.05657958984375,
      "learning_rate": 6.83068622519821e-07,
      "loss": 0.0002,
      "reward": -0.4914732947945595,
      "reward_std": 0.17538534849882126,
      "rewards/cosine_scaled_reward": -0.24573664739727974,
      "rewards/format_reward": 0.0,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1337142857142857,
      "grad_norm": 0.17368990182876587,
      "kl": 0.0556640625,
      "learning_rate": 6.800643086250121e-07,
      "loss": 0.0002,
      "reward": -0.5374301336705685,
      "reward_std": 0.2229807935655117,
      "rewards/cosine_scaled_reward": -0.26871506683528423,
      "rewards/format_reward": 0.0,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2889.9583435058594,
      "epoch": 0.13428571428571429,
      "grad_norm": 0.18229198455810547,
      "kl": 0.06298828125,
      "learning_rate": 6.770536555792944e-07,
      "loss": 0.0052,
      "reward": 0.22926755994558334,
      "reward_std": 0.5730621181428432,
      "rewards/cosine_scaled_reward": -0.010366253554821014,
      "rewards/format_reward": 0.25,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2836.5000610351562,
      "epoch": 0.13485714285714287,
      "grad_norm": 0.19761309027671814,
      "kl": 0.06787109375,
      "learning_rate": 6.740368101176495e-07,
      "loss": 0.0007,
      "reward": 0.009470507502555847,
      "reward_std": 0.4301174655556679,
      "rewards/cosine_scaled_reward": -0.30776476114988327,
      "rewards/format_reward": 0.625,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2920.541717529297,
      "epoch": 0.13542857142857143,
      "grad_norm": 0.4432980418205261,
      "kl": 0.07293701171875,
      "learning_rate": 6.710139192768694e-07,
      "loss": 0.2146,
      "reward": -0.30785553343594074,
      "reward_std": 0.3011339120566845,
      "rewards/cosine_scaled_reward": -0.299761101603508,
      "rewards/format_reward": 0.291666679084301,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3406.5000610351562,
      "epoch": 0.136,
      "grad_norm": 0.1461995542049408,
      "kl": 0.0609130859375,
      "learning_rate": 6.679851303883891e-07,
      "loss": 0.0953,
      "reward": -0.4275861941277981,
      "reward_std": 0.42392516881227493,
      "rewards/cosine_scaled_reward": -0.3179597780108452,
      "rewards/format_reward": 0.2083333395421505,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3582.4583740234375,
      "epoch": 0.13657142857142857,
      "grad_norm": 0.14785300195217133,
      "kl": 0.05682373046875,
      "learning_rate": 6.649505910711058e-07,
      "loss": 0.0009,
      "reward": -0.2121518924832344,
      "reward_std": 0.45589612051844597,
      "rewards/cosine_scaled_reward": -0.1269092820584774,
      "rewards/format_reward": 0.0416666679084301,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.13714285714285715,
      "grad_norm": 0.17068707942962646,
      "kl": 0.0528564453125,
      "learning_rate": 6.619104492241847e-07,
      "loss": 0.0002,
      "reward": -0.6798624247312546,
      "reward_std": 0.14933781139552593,
      "rewards/cosine_scaled_reward": -0.3399312049150467,
      "rewards/format_reward": 0.0,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2797.2916870117188,
      "epoch": 0.1377142857142857,
      "grad_norm": 0.2396436184644699,
      "kl": 0.05389404296875,
      "learning_rate": 6.588648530198504e-07,
      "loss": 0.0737,
      "reward": 0.3082697440404445,
      "reward_std": 0.345156442373991,
      "rewards/cosine_scaled_reward": -0.033365145325660706,
      "rewards/format_reward": 0.375,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1382857142857143,
      "grad_norm": 0.21391235291957855,
      "kl": 0.07708740234375,
      "learning_rate": 6.558139508961654e-07,
      "loss": 0.0003,
      "reward": -0.508253276348114,
      "reward_std": 0.14199923910200596,
      "rewards/cosine_scaled_reward": -0.254126638174057,
      "rewards/format_reward": 0.0,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.13885714285714285,
      "grad_norm": 0.16383755207061768,
      "kl": 0.070068359375,
      "learning_rate": 6.527578915497951e-07,
      "loss": 0.0003,
      "reward": -0.6054684594273567,
      "reward_std": 0.18681432865560055,
      "rewards/cosine_scaled_reward": -0.30273422971367836,
      "rewards/format_reward": 0.0,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2971.6666870117188,
      "epoch": 0.13942857142857143,
      "grad_norm": 0.5523943305015564,
      "kl": 0.07171630859375,
      "learning_rate": 6.496968239287603e-07,
      "loss": 0.2458,
      "reward": -0.2852616235613823,
      "reward_std": 0.6826003473252058,
      "rewards/cosine_scaled_reward": -0.28846415132284164,
      "rewards/format_reward": 0.2916666753590107,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3252.5833740234375,
      "epoch": 0.14,
      "grad_norm": 0.34070122241973877,
      "kl": 0.0665283203125,
      "learning_rate": 6.466308972251785e-07,
      "loss": 0.1237,
      "reward": -0.17518402566201985,
      "reward_std": 0.9328324533998966,
      "rewards/cosine_scaled_reward": -0.19175868202000856,
      "rewards/format_reward": 0.2083333358168602,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3163.6666870117188,
      "epoch": 0.14057142857142857,
      "grad_norm": 0.21714583039283752,
      "kl": 0.0450439453125,
      "learning_rate": 6.435602608679916e-07,
      "loss": 0.0759,
      "reward": 0.32051989436149597,
      "reward_std": 0.785211868584156,
      "rewards/cosine_scaled_reward": 0.03525993227958679,
      "rewards/format_reward": 0.2500000111758709,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.14114285714285715,
      "grad_norm": 0.14294424653053284,
      "kl": 0.05615234375,
      "learning_rate": 6.404850645156841e-07,
      "loss": 0.0002,
      "reward": -0.5327610298991203,
      "reward_std": 0.23994574137032032,
      "rewards/cosine_scaled_reward": -0.26638051867485046,
      "rewards/format_reward": 0.0,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2089.9166870117188,
      "epoch": 0.1417142857142857,
      "grad_norm": 0.6865227222442627,
      "kl": 0.06201171875,
      "learning_rate": 6.374054580489873e-07,
      "loss": 0.2536,
      "reward": 0.7073055021464825,
      "reward_std": 0.8850769177079201,
      "rewards/cosine_scaled_reward": 0.04115273058414459,
      "rewards/format_reward": 0.6250000149011612,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3493.0,
      "epoch": 0.1422857142857143,
      "grad_norm": 0.2024909406900406,
      "kl": 0.052978515625,
      "learning_rate": 6.343215915635761e-07,
      "loss": 0.0052,
      "reward": -0.4589925929903984,
      "reward_std": 0.3254140354692936,
      "rewards/cosine_scaled_reward": -0.3336629644036293,
      "rewards/format_reward": 0.2083333432674408,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3328.75,
      "epoch": 0.14285714285714285,
      "grad_norm": 0.16820330917835236,
      "kl": 0.0540771484375,
      "learning_rate": 6.31233615362752e-07,
      "loss": 0.0172,
      "reward": 0.03026793897151947,
      "reward_std": 0.5065001584589481,
      "rewards/cosine_scaled_reward": -0.08903270214796066,
      "rewards/format_reward": 0.2083333432674408,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2929.4583435058594,
      "epoch": 0.14342857142857143,
      "grad_norm": 0.16301412880420685,
      "kl": 0.05047607421875,
      "learning_rate": 6.281416799501187e-07,
      "loss": -0.0337,
      "reward": 0.12225878238677979,
      "reward_std": 0.5798847824335098,
      "rewards/cosine_scaled_reward": -0.1472039744257927,
      "rewards/format_reward": 0.4166666716337204,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3335.9166870117188,
      "epoch": 0.144,
      "grad_norm": 0.41827014088630676,
      "kl": 0.0655517578125,
      "learning_rate": 6.25045936022246e-07,
      "loss": 0.1143,
      "reward": -0.3226817846298218,
      "reward_std": 0.5383487045764923,
      "rewards/cosine_scaled_reward": -0.2446742206811905,
      "rewards/format_reward": 0.1666666679084301,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3154.5416870117188,
      "epoch": 0.14457142857142857,
      "grad_norm": 0.2202560156583786,
      "kl": 0.320556640625,
      "learning_rate": 6.219465344613258e-07,
      "loss": 0.0519,
      "reward": 0.08730845898389816,
      "reward_std": 0.3982619745656848,
      "rewards/cosine_scaled_reward": -0.060512442141771317,
      "rewards/format_reward": 0.2083333432674408,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2991.75,
      "epoch": 0.14514285714285713,
      "grad_norm": 0.1752176582813263,
      "kl": 0.037750244140625,
      "learning_rate": 6.188436263278172e-07,
      "loss": 0.0455,
      "reward": -0.2120664268732071,
      "reward_std": 0.19734735041856766,
      "rewards/cosine_scaled_reward": -0.23103319853544235,
      "rewards/format_reward": 0.25,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3577.4583740234375,
      "epoch": 0.1457142857142857,
      "grad_norm": 0.19093064963817596,
      "kl": 0.0672607421875,
      "learning_rate": 6.157373628530852e-07,
      "loss": 0.0024,
      "reward": -0.43620166182518005,
      "reward_std": 0.5630457103252411,
      "rewards/cosine_scaled_reward": -0.259767509996891,
      "rewards/format_reward": 0.0833333358168602,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1462857142857143,
      "grad_norm": 0.27532970905303955,
      "kl": 0.04736328125,
      "learning_rate": 6.126278954320294e-07,
      "loss": 0.0002,
      "reward": -0.7111312747001648,
      "reward_std": 0.19976815208792686,
      "rewards/cosine_scaled_reward": -0.3555656373500824,
      "rewards/format_reward": 0.0,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3072.7916870117188,
      "epoch": 0.14685714285714285,
      "grad_norm": 0.2951967418193817,
      "kl": 0.06939697265625,
      "learning_rate": 6.095153756157051e-07,
      "loss": 0.1916,
      "reward": -0.30195215344429016,
      "reward_std": 0.5967418141663074,
      "rewards/cosine_scaled_reward": -0.23430940508842468,
      "rewards/format_reward": 0.1666666716337204,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2571.916717529297,
      "epoch": 0.14742857142857144,
      "grad_norm": 0.24137046933174133,
      "kl": 0.0477294921875,
      "learning_rate": 6.06399955103937e-07,
      "loss": -0.0004,
      "reward": 0.16775222728028893,
      "reward_std": 0.5024616029113531,
      "rewards/cosine_scaled_reward": -0.1661239117383957,
      "rewards/format_reward": 0.5000000111758709,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.148,
      "grad_norm": 0.15327315032482147,
      "kl": 0.055908203125,
      "learning_rate": 6.032817857379256e-07,
      "loss": 0.0002,
      "reward": -0.6158068254590034,
      "reward_std": 0.23382795974612236,
      "rewards/cosine_scaled_reward": -0.3079034052789211,
      "rewards/format_reward": 0.0,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3555.75,
      "epoch": 0.14857142857142858,
      "grad_norm": 0.13946880400180817,
      "kl": 0.0531005859375,
      "learning_rate": 6.001610194928464e-07,
      "loss": 0.0039,
      "reward": -0.1873398944735527,
      "reward_std": 0.41095768846571445,
      "rewards/cosine_scaled_reward": -0.13533661514520645,
      "rewards/format_reward": 0.0833333358168602,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.14914285714285713,
      "grad_norm": 0.14360758662223816,
      "kl": 0.05487060546875,
      "learning_rate": 5.97037808470444e-07,
      "loss": 0.0002,
      "reward": -0.652643047273159,
      "reward_std": 0.15021498315036297,
      "rewards/cosine_scaled_reward": -0.3263215161859989,
      "rewards/format_reward": 0.0,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3575.9583740234375,
      "epoch": 0.14971428571428572,
      "grad_norm": 0.16494929790496826,
      "kl": 0.05615234375,
      "learning_rate": 5.939123048916173e-07,
      "loss": 0.0024,
      "reward": -0.3416307270526886,
      "reward_std": 0.6418765336275101,
      "rewards/cosine_scaled_reward": -0.2333153709769249,
      "rewards/format_reward": 0.125,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.15028571428571427,
      "grad_norm": 0.16168098151683807,
      "kl": 0.068115234375,
      "learning_rate": 5.907846610890011e-07,
      "loss": 0.0003,
      "reward": -0.47282079607248306,
      "reward_std": 0.17454615235328674,
      "rewards/cosine_scaled_reward": -0.23641039803624153,
      "rewards/format_reward": 0.0,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.15085714285714286,
      "grad_norm": 0.17631350457668304,
      "kl": 0.065673828125,
      "learning_rate": 5.87655029499542e-07,
      "loss": 0.0003,
      "reward": -0.388326043728739,
      "reward_std": 0.3333720900118351,
      "rewards/cosine_scaled_reward": -0.19416302209720016,
      "rewards/format_reward": 0.0,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3424.2083740234375,
      "epoch": 0.15142857142857144,
      "grad_norm": 0.19585396349430084,
      "kl": 0.070556640625,
      "learning_rate": 5.845235626570683e-07,
      "loss": 0.057,
      "reward": -0.058371422346681356,
      "reward_std": 0.7044498100876808,
      "rewards/cosine_scaled_reward": -0.15418571420013905,
      "rewards/format_reward": 0.25,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3448.2083740234375,
      "epoch": 0.152,
      "grad_norm": 0.15063370764255524,
      "kl": 0.061279296875,
      "learning_rate": 5.813904131848564e-07,
      "loss": 0.0162,
      "reward": -0.41802845895290375,
      "reward_std": 0.29004839062690735,
      "rewards/cosine_scaled_reward": -0.31318090111017227,
      "rewards/format_reward": 0.2083333432674408,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2616.7500915527344,
      "epoch": 0.15257142857142858,
      "grad_norm": 0.27274543046951294,
      "kl": 0.1700439453125,
      "learning_rate": 5.78255733788191e-07,
      "loss": 0.0972,
      "reward": -0.0867045596241951,
      "reward_std": 0.6231234706938267,
      "rewards/cosine_scaled_reward": -0.2516856137663126,
      "rewards/format_reward": 0.4166666716337204,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2716.875030517578,
      "epoch": 0.15314285714285714,
      "grad_norm": 0.29783037304878235,
      "kl": 0.05804443359375,
      "learning_rate": 5.751196772469237e-07,
      "loss": 0.0792,
      "reward": 0.11836675927042961,
      "reward_std": 0.5197452530264854,
      "rewards/cosine_scaled_reward": -0.12831661850214005,
      "rewards/format_reward": 0.375,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.15371428571428572,
      "grad_norm": 0.16401593387126923,
      "kl": 0.053466796875,
      "learning_rate": 5.71982396408026e-07,
      "loss": 0.0002,
      "reward": -0.5338539285585284,
      "reward_std": 0.09281859919428825,
      "rewards/cosine_scaled_reward": -0.2669269605539739,
      "rewards/format_reward": 0.0,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2750.0416870117188,
      "epoch": 0.15428571428571428,
      "grad_norm": 0.2087016999721527,
      "kl": 0.07025146484375,
      "learning_rate": 5.688440441781398e-07,
      "loss": -0.0195,
      "reward": 0.8994439318776131,
      "reward_std": 0.3248627707362175,
      "rewards/cosine_scaled_reward": 0.22055532410740852,
      "rewards/format_reward": 0.4583333432674408,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3442.2083740234375,
      "epoch": 0.15485714285714286,
      "grad_norm": 0.21406613290309906,
      "kl": 0.08978271484375,
      "learning_rate": 5.657047735161255e-07,
      "loss": 0.0622,
      "reward": -0.3841980127617717,
      "reward_std": 0.4643286466598511,
      "rewards/cosine_scaled_reward": -0.2754323445260525,
      "rewards/format_reward": 0.1666666716337204,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2941.0833435058594,
      "epoch": 0.15542857142857142,
      "grad_norm": 0.30245617032051086,
      "kl": 0.053955078125,
      "learning_rate": 5.625647374256061e-07,
      "loss": 0.1571,
      "reward": -0.32398582744644955,
      "reward_std": 0.7070891531184316,
      "rewards/cosine_scaled_reward": -0.18282624892890453,
      "rewards/format_reward": 0.0416666679084301,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3562.5833740234375,
      "epoch": 0.156,
      "grad_norm": 0.2364557981491089,
      "kl": 0.08203125,
      "learning_rate": 5.594240889475106e-07,
      "loss": 0.0027,
      "reward": -0.5826010927557945,
      "reward_std": 0.2715669982135296,
      "rewards/cosine_scaled_reward": -0.29130054637789726,
      "rewards/format_reward": 0.0,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3581.625,
      "epoch": 0.15657142857142858,
      "grad_norm": 0.15081574022769928,
      "kl": 0.0504150390625,
      "learning_rate": 5.562829811526154e-07,
      "loss": 0.0015,
      "reward": -0.5524590611457825,
      "reward_std": 0.2481101332232356,
      "rewards/cosine_scaled_reward": -0.29706286266446114,
      "rewards/format_reward": 0.0416666679084301,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3234.7500610351562,
      "epoch": 0.15714285714285714,
      "grad_norm": 0.152909055352211,
      "kl": 0.05181884765625,
      "learning_rate": 5.531415671340826e-07,
      "loss": 0.1079,
      "reward": -0.4288216568529606,
      "reward_std": 0.3044823817908764,
      "rewards/cosine_scaled_reward": -0.29774416238069534,
      "rewards/format_reward": 0.1666666679084301,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2614.8333587646484,
      "epoch": 0.15771428571428572,
      "grad_norm": 0.23602361977100372,
      "kl": 0.05712890625,
      "learning_rate": 5.5e-07,
      "loss": -0.0228,
      "reward": 0.7694906368851662,
      "reward_std": 1.0586964339017868,
      "rewards/cosine_scaled_reward": 0.11391199752688408,
      "rewards/format_reward": 0.5416666865348816,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3485.3333740234375,
      "epoch": 0.15828571428571428,
      "grad_norm": 0.1500847041606903,
      "kl": 0.0594482421875,
      "learning_rate": 5.468584328659172e-07,
      "loss": 0.0609,
      "reward": -0.2842923626303673,
      "reward_std": 0.6474459134042263,
      "rewards/cosine_scaled_reward": -0.18381286598742008,
      "rewards/format_reward": 0.0833333358168602,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3267.916748046875,
      "epoch": 0.15885714285714286,
      "grad_norm": 0.15897172689437866,
      "kl": 0.0579833984375,
      "learning_rate": 5.437170188473847e-07,
      "loss": 0.0396,
      "reward": -0.12434278428554535,
      "reward_std": 0.8399608321487904,
      "rewards/cosine_scaled_reward": -0.18717139214277267,
      "rewards/format_reward": 0.25,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3488.3333740234375,
      "epoch": 0.15942857142857142,
      "grad_norm": 0.23828986287117004,
      "kl": 0.0665283203125,
      "learning_rate": 5.405759110524894e-07,
      "loss": 0.0221,
      "reward": 0.005788974463939667,
      "reward_std": 0.5269475094974041,
      "rewards/cosine_scaled_reward": -0.08043883368372917,
      "rewards/format_reward": 0.1666666679084301,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2997.0833740234375,
      "epoch": 0.16,
      "grad_norm": 0.21499881148338318,
      "kl": 0.04998779296875,
      "learning_rate": 5.37435262574394e-07,
      "loss": 0.0863,
      "reward": 0.2994392313994467,
      "reward_std": 0.5071588382124901,
      "rewards/cosine_scaled_reward": -0.05861368915066123,
      "rewards/format_reward": 0.4166666865348816,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2781.791748046875,
      "epoch": 0.16057142857142856,
      "grad_norm": 0.3882482647895813,
      "kl": 0.0693359375,
      "learning_rate": 5.342952264838747e-07,
      "loss": 0.2062,
      "reward": 0.33036062493920326,
      "reward_std": 0.5028475373983383,
      "rewards/cosine_scaled_reward": -0.02231965959072113,
      "rewards/format_reward": 0.3750000037252903,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3425.9166870117188,
      "epoch": 0.16114285714285714,
      "grad_norm": 0.15672989189624786,
      "kl": 0.05950927734375,
      "learning_rate": 5.311559558218603e-07,
      "loss": 0.0345,
      "reward": 0.03875131532549858,
      "reward_std": 0.6108375154435635,
      "rewards/cosine_scaled_reward": -0.06395764835178852,
      "rewards/format_reward": 0.1666666716337204,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3141.7916870117188,
      "epoch": 0.16171428571428573,
      "grad_norm": 0.24408189952373505,
      "kl": 0.06939697265625,
      "learning_rate": 5.28017603591974e-07,
      "loss": 0.1504,
      "reward": -0.5937509834766388,
      "reward_std": 0.25269549153745174,
      "rewards/cosine_scaled_reward": -0.3802088275551796,
      "rewards/format_reward": 0.1666666716337204,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2734.750045776367,
      "epoch": 0.16228571428571428,
      "grad_norm": 0.3049091100692749,
      "kl": 0.06610107421875,
      "learning_rate": 5.248803227530763e-07,
      "loss": 0.1375,
      "reward": 0.437591552734375,
      "reward_std": 0.3647673763334751,
      "rewards/cosine_scaled_reward": 0.07296241726726294,
      "rewards/format_reward": 0.2916666679084301,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.16285714285714287,
      "grad_norm": 0.18352235853672028,
      "kl": 0.07177734375,
      "learning_rate": 5.21744266211809e-07,
      "loss": 0.0003,
      "reward": -0.5737389139831066,
      "reward_std": 0.1539888195693493,
      "rewards/cosine_scaled_reward": -0.2868694569915533,
      "rewards/format_reward": 0.0,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3354.1250610351562,
      "epoch": 0.16342857142857142,
      "grad_norm": 0.2930234968662262,
      "kl": 0.0418701171875,
      "learning_rate": 5.186095868151436e-07,
      "loss": 0.1145,
      "reward": -0.2171124890446663,
      "reward_std": 0.7043565139174461,
      "rewards/cosine_scaled_reward": -0.1918895822018385,
      "rewards/format_reward": 0.1666666716337204,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3343.3750610351562,
      "epoch": 0.164,
      "grad_norm": 0.16565990447998047,
      "kl": 0.05560302734375,
      "learning_rate": 5.154764373429315e-07,
      "loss": 0.0172,
      "reward": -0.2116563618183136,
      "reward_std": 0.49668116495013237,
      "rewards/cosine_scaled_reward": -0.25166154466569424,
      "rewards/format_reward": 0.2916666679084301,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3557.3750610351562,
      "epoch": 0.16457142857142856,
      "grad_norm": 0.14387962222099304,
      "kl": 0.06109619140625,
      "learning_rate": 5.123449705004581e-07,
      "loss": 0.0158,
      "reward": -0.7241450697183609,
      "reward_std": 0.3718634694814682,
      "rewards/cosine_scaled_reward": -0.40373920649290085,
      "rewards/format_reward": 0.0833333358168602,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.16514285714285715,
      "grad_norm": 0.1413969099521637,
      "kl": 0.06207275390625,
      "learning_rate": 5.09215338910999e-07,
      "loss": 0.0002,
      "reward": -0.49317351169884205,
      "reward_std": 0.11529102176427841,
      "rewards/cosine_scaled_reward": -0.24658673349767923,
      "rewards/format_reward": 0.0,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3508.0416870117188,
      "epoch": 0.1657142857142857,
      "grad_norm": 0.16138982772827148,
      "kl": 0.09033203125,
      "learning_rate": 5.060876951083828e-07,
      "loss": 0.0174,
      "reward": -0.308999203145504,
      "reward_std": 0.2485736645758152,
      "rewards/cosine_scaled_reward": -0.1961662769317627,
      "rewards/format_reward": 0.0833333358168602,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1662857142857143,
      "grad_norm": 0.6058784127235413,
      "kl": 0.123291015625,
      "learning_rate": 5.02962191529556e-07,
      "loss": 0.0005,
      "reward": -0.41384733468294144,
      "reward_std": 0.3912395089864731,
      "rewards/cosine_scaled_reward": -0.20692366734147072,
      "rewards/format_reward": 0.0,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3541.9166870117188,
      "epoch": 0.16685714285714287,
      "grad_norm": 0.1335904747247696,
      "kl": 0.0677490234375,
      "learning_rate": 4.998389805071536e-07,
      "loss": 0.0181,
      "reward": -0.5098201781511307,
      "reward_std": 0.5878917332738638,
      "rewards/cosine_scaled_reward": -0.29657675698399544,
      "rewards/format_reward": 0.0833333358168602,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3127.541748046875,
      "epoch": 0.16742857142857143,
      "grad_norm": 0.19865606725215912,
      "kl": 0.07427978515625,
      "learning_rate": 4.967182142620745e-07,
      "loss": 0.133,
      "reward": 0.4655437543988228,
      "reward_std": 1.0733295306563377,
      "rewards/cosine_scaled_reward": 0.04527186043560505,
      "rewards/format_reward": 0.3750000074505806,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3428.7083740234375,
      "epoch": 0.168,
      "grad_norm": 0.17549487948417664,
      "kl": 0.0758056640625,
      "learning_rate": 4.93600044896063e-07,
      "loss": 0.0327,
      "reward": -0.4640468708239496,
      "reward_std": 0.46714043989777565,
      "rewards/cosine_scaled_reward": -0.2736901044845581,
      "rewards/format_reward": 0.0833333358168602,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3540.375,
      "epoch": 0.16857142857142857,
      "grad_norm": 0.17328360676765442,
      "kl": 0.065399169921875,
      "learning_rate": 4.904846243842949e-07,
      "loss": 0.0115,
      "reward": -0.21514444053173065,
      "reward_std": 0.19357968866825104,
      "rewards/cosine_scaled_reward": -0.14923888631165028,
      "rewards/format_reward": 0.0833333358168602,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3369.0416870117188,
      "epoch": 0.16914285714285715,
      "grad_norm": 0.26708632707595825,
      "kl": 0.075439453125,
      "learning_rate": 4.873721045679706e-07,
      "loss": 0.0851,
      "reward": -0.5817175265401602,
      "reward_std": 0.30820662481710315,
      "rewards/cosine_scaled_reward": -0.35335876047611237,
      "rewards/format_reward": 0.125,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1697142857142857,
      "grad_norm": 0.14792469143867493,
      "kl": 0.0599365234375,
      "learning_rate": 4.842626371469149e-07,
      "loss": 0.0002,
      "reward": -0.22593041975051165,
      "reward_std": 0.6252758391201496,
      "rewards/cosine_scaled_reward": -0.15463187545537949,
      "rewards/format_reward": 0.0833333358168602,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3517.875,
      "epoch": 0.1702857142857143,
      "grad_norm": 0.17152658104896545,
      "kl": 0.06243896484375,
      "learning_rate": 4.811563736721829e-07,
      "loss": 0.0254,
      "reward": -0.43114787340164185,
      "reward_std": 0.5913300961256027,
      "rewards/cosine_scaled_reward": -0.2572406269609928,
      "rewards/format_reward": 0.0833333358168602,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3537.6666870117188,
      "epoch": 0.17085714285714285,
      "grad_norm": 0.16120056807994843,
      "kl": 0.0836181640625,
      "learning_rate": 4.780534655386743e-07,
      "loss": 0.028,
      "reward": -0.09139684634283185,
      "reward_std": 0.4525203984230757,
      "rewards/cosine_scaled_reward": -0.06653175503015518,
      "rewards/format_reward": 0.0416666679084301,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3293.8333740234375,
      "epoch": 0.17142857142857143,
      "grad_norm": 0.20001089572906494,
      "kl": 0.0758056640625,
      "learning_rate": 4.749540639777539e-07,
      "loss": 0.0915,
      "reward": -0.4263968728482723,
      "reward_std": 0.5731936097145081,
      "rewards/cosine_scaled_reward": -0.2965317729394883,
      "rewards/format_reward": 0.1666666679084301,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3485.8333740234375,
      "epoch": 0.172,
      "grad_norm": 0.1472051441669464,
      "kl": 0.05682373046875,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.0364,
      "reward": -0.38035064563155174,
      "reward_std": 0.330845657736063,
      "rewards/cosine_scaled_reward": -0.2526753172278404,
      "rewards/format_reward": 0.125,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2926.625,
      "epoch": 0.17257142857142857,
      "grad_norm": 0.17583613097667694,
      "kl": 0.0887451171875,
      "learning_rate": 4.68766384637248e-07,
      "loss": 0.0105,
      "reward": 0.0778973288834095,
      "reward_std": 0.35484304931014776,
      "rewards/cosine_scaled_reward": -0.08605133555829525,
      "rewards/format_reward": 0.25,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3239.125,
      "epoch": 0.17314285714285715,
      "grad_norm": 0.22526656091213226,
      "kl": 0.0518798828125,
      "learning_rate": 4.656784084364238e-07,
      "loss": -0.0222,
      "reward": 0.1513216346502304,
      "reward_std": 0.2562936134636402,
      "rewards/cosine_scaled_reward": -0.04933919757604599,
      "rewards/format_reward": 0.25,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3572.5833740234375,
      "epoch": 0.1737142857142857,
      "grad_norm": 0.1886323094367981,
      "kl": 0.0618896484375,
      "learning_rate": 4.6259454195101267e-07,
      "loss": 0.0052,
      "reward": -0.6805952340364456,
      "reward_std": 0.24055694788694382,
      "rewards/cosine_scaled_reward": -0.361130952835083,
      "rewards/format_reward": 0.0416666679084301,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3441.7083740234375,
      "epoch": 0.1742857142857143,
      "grad_norm": 0.33955273032188416,
      "kl": 0.16815185546875,
      "learning_rate": 4.59514935484316e-07,
      "loss": 0.0358,
      "reward": -0.30776484683156013,
      "reward_std": 0.5681664384901524,
      "rewards/cosine_scaled_reward": -0.2788824327290058,
      "rewards/format_reward": 0.2500000111758709,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2418.8333435058594,
      "epoch": 0.17485714285714285,
      "grad_norm": 0.2670734226703644,
      "kl": 0.156005859375,
      "learning_rate": 4.5643973913200837e-07,
      "loss": 0.2228,
      "reward": 0.33872567117214203,
      "reward_std": 0.8834966868162155,
      "rewards/cosine_scaled_reward": -0.03897051140666008,
      "rewards/format_reward": 0.4166666716337204,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.17542857142857143,
      "grad_norm": 0.1793275624513626,
      "kl": 0.04876708984375,
      "learning_rate": 4.5336910277482155e-07,
      "loss": 0.0002,
      "reward": -0.7549219056963921,
      "reward_std": 0.10495427064597607,
      "rewards/cosine_scaled_reward": -0.37746093794703484,
      "rewards/format_reward": 0.0,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2914.875,
      "epoch": 0.176,
      "grad_norm": 0.16645893454551697,
      "kl": 0.0543212890625,
      "learning_rate": 4.503031760712397e-07,
      "loss": -0.0321,
      "reward": 0.7862786203622818,
      "reward_std": 0.4856582581996918,
      "rewards/cosine_scaled_reward": 0.1431393399834633,
      "rewards/format_reward": 0.5,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.17657142857142857,
      "grad_norm": 0.1512133628129959,
      "kl": 0.06036376953125,
      "learning_rate": 4.4724210845020494e-07,
      "loss": 0.0002,
      "reward": -0.5015019737184048,
      "reward_std": 0.18755416758358479,
      "rewards/cosine_scaled_reward": -0.25075098779052496,
      "rewards/format_reward": 0.0,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.17714285714285713,
      "grad_norm": 0.1387099325656891,
      "kl": 0.0472412109375,
      "learning_rate": 4.441860491038345e-07,
      "loss": 0.0002,
      "reward": -0.7481062561273575,
      "reward_std": 0.17375030741095543,
      "rewards/cosine_scaled_reward": -0.37405312806367874,
      "rewards/format_reward": 0.0,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3508.6666870117188,
      "epoch": 0.1777142857142857,
      "grad_norm": 0.15475568175315857,
      "kl": 0.05950927734375,
      "learning_rate": 4.4113514698014953e-07,
      "loss": 0.0364,
      "reward": -0.0750106479972601,
      "reward_std": 0.5773040950298309,
      "rewards/cosine_scaled_reward": -0.12083865702152252,
      "rewards/format_reward": 0.1666666679084301,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1782857142857143,
      "grad_norm": 0.1638628989458084,
      "kl": 0.084228515625,
      "learning_rate": 4.3808955077581546e-07,
      "loss": 0.0003,
      "reward": -0.566048726439476,
      "reward_std": 0.20274320989847183,
      "rewards/cosine_scaled_reward": -0.2830243557691574,
      "rewards/format_reward": 0.0,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2778.333335876465,
      "epoch": 0.17885714285714285,
      "grad_norm": 0.2880570888519287,
      "kl": 0.05511474609375,
      "learning_rate": 4.350494089288943e-07,
      "loss": -0.0187,
      "reward": 0.12140727043151855,
      "reward_std": 0.5429899077862501,
      "rewards/cosine_scaled_reward": -0.06429639086127281,
      "rewards/format_reward": 0.25,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3279.2916870117188,
      "epoch": 0.17942857142857144,
      "grad_norm": 0.2017597258090973,
      "kl": 0.06085205078125,
      "learning_rate": 4.3201486961161093e-07,
      "loss": 0.0592,
      "reward": -0.1488794982433319,
      "reward_std": 0.4336536340415478,
      "rewards/cosine_scaled_reward": -0.17860640585422516,
      "rewards/format_reward": 0.2083333432674408,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3494.0,
      "epoch": 0.18,
      "grad_norm": 0.1533028930425644,
      "kl": 0.06219482421875,
      "learning_rate": 4.2898608072313045e-07,
      "loss": 0.017,
      "reward": -0.6007172577083111,
      "reward_std": 0.23412644118070602,
      "rewards/cosine_scaled_reward": -0.3420252874493599,
      "rewards/format_reward": 0.0833333358168602,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3558.875,
      "epoch": 0.18057142857142858,
      "grad_norm": 0.1556541472673416,
      "kl": 0.06085205078125,
      "learning_rate": 4.2596318988235037e-07,
      "loss": 0.0149,
      "reward": -0.5843707770109177,
      "reward_std": 0.26823178865015507,
      "rewards/cosine_scaled_reward": -0.31301872432231903,
      "rewards/format_reward": 0.0416666679084301,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3410.5000610351562,
      "epoch": 0.18114285714285713,
      "grad_norm": 0.30716508626937866,
      "kl": 0.0994873046875,
      "learning_rate": 4.2294634442070553e-07,
      "loss": 0.0859,
      "reward": -0.14803938567638397,
      "reward_std": 0.9198054745793343,
      "rewards/cosine_scaled_reward": -0.15735302958637476,
      "rewards/format_reward": 0.1666666716337204,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3349.4166870117188,
      "epoch": 0.18171428571428572,
      "grad_norm": 0.16601794958114624,
      "kl": 0.06646728515625,
      "learning_rate": 4.1993569137498776e-07,
      "loss": 0.0599,
      "reward": -0.34297938644886017,
      "reward_std": 0.745935533195734,
      "rewards/cosine_scaled_reward": -0.2548230402171612,
      "rewards/format_reward": 0.1666666679084301,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3559.7083740234375,
      "epoch": 0.18228571428571427,
      "grad_norm": 0.18891586363315582,
      "kl": 0.056884765625,
      "learning_rate": 4.1693137748017915e-07,
      "loss": 0.0138,
      "reward": -0.45162222534418106,
      "reward_std": 0.5305716693401337,
      "rewards/cosine_scaled_reward": -0.26747776684351265,
      "rewards/format_reward": 0.0833333358168602,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3171.75,
      "epoch": 0.18285714285714286,
      "grad_norm": 0.14246085286140442,
      "kl": 0.055816650390625,
      "learning_rate": 4.1393354916230005e-07,
      "loss": -0.044,
      "reward": 0.17042651772499084,
      "reward_std": 0.24396423622965813,
      "rewards/cosine_scaled_reward": -0.03978674113750458,
      "rewards/format_reward": 0.25,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.18342857142857144,
      "grad_norm": 0.16783183813095093,
      "kl": 0.07696533203125,
      "learning_rate": 4.1094235253127374e-07,
      "loss": 0.0003,
      "reward": -0.5164474323391914,
      "reward_std": 0.16403124667704105,
      "rewards/cosine_scaled_reward": -0.25822371058166027,
      "rewards/format_reward": 0.0,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.184,
      "grad_norm": 0.13648982346057892,
      "kl": 0.05108642578125,
      "learning_rate": 4.079579333738039e-07,
      "loss": 0.0002,
      "reward": -0.3795193247497082,
      "reward_std": 0.49757753871381283,
      "rewards/cosine_scaled_reward": -0.2105929981917143,
      "rewards/format_reward": 0.0416666679084301,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.18457142857142858,
      "grad_norm": 0.14335662126541138,
      "kl": 0.05615234375,
      "learning_rate": 4.0498043714627006e-07,
      "loss": 0.0002,
      "reward": -0.499904029071331,
      "reward_std": 0.40482280403375626,
      "rewards/cosine_scaled_reward": -0.2499520145356655,
      "rewards/format_reward": 0.0,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3524.3333740234375,
      "epoch": 0.18514285714285714,
      "grad_norm": 0.1648027002811432,
      "kl": 0.05670166015625,
      "learning_rate": 4.020100089676376e-07,
      "loss": 0.0232,
      "reward": -0.22896066308021545,
      "reward_std": 0.8391835503280163,
      "rewards/cosine_scaled_reward": -0.17698032222688198,
      "rewards/format_reward": 0.1250000037252903,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3188.75,
      "epoch": 0.18571428571428572,
      "grad_norm": 0.30512839555740356,
      "kl": 0.0650634765625,
      "learning_rate": 3.9904679361238526e-07,
      "loss": 0.1489,
      "reward": -0.47016472124960274,
      "reward_std": 0.2999837603420019,
      "rewards/cosine_scaled_reward": -0.31841566786170006,
      "rewards/format_reward": 0.1666666716337204,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2809.458335876465,
      "epoch": 0.18628571428571428,
      "grad_norm": 0.502647876739502,
      "kl": 0.085205078125,
      "learning_rate": 3.9609093550344907e-07,
      "loss": 0.1019,
      "reward": -0.14913707971572876,
      "reward_std": 0.15667949616909027,
      "rewards/cosine_scaled_reward": -0.19956854358315468,
      "rewards/format_reward": 0.25,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.18685714285714286,
      "grad_norm": 0.15863609313964844,
      "kl": 0.058349609375,
      "learning_rate": 3.931425787051832e-07,
      "loss": 0.0002,
      "reward": -0.6654567644000053,
      "reward_std": 0.10124669130891562,
      "rewards/cosine_scaled_reward": -0.33272838313132524,
      "rewards/format_reward": 0.0,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3116.291717529297,
      "epoch": 0.18742857142857142,
      "grad_norm": 0.3484804332256317,
      "kl": 0.09393310546875,
      "learning_rate": 3.902018669163384e-07,
      "loss": 0.1693,
      "reward": -0.508688498288393,
      "reward_std": 0.37443263083696365,
      "rewards/cosine_scaled_reward": -0.37934424355626106,
      "rewards/format_reward": 0.2500000074505806,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3456.5,
      "epoch": 0.188,
      "grad_norm": 0.20245981216430664,
      "kl": 0.05224609375,
      "learning_rate": 3.872689434630585e-07,
      "loss": 0.0368,
      "reward": -0.6058483272790909,
      "reward_std": 0.33944240026175976,
      "rewards/cosine_scaled_reward": -0.36542417109012604,
      "rewards/format_reward": 0.125,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3498.375,
      "epoch": 0.18857142857142858,
      "grad_norm": 0.15449191629886627,
      "kl": 0.05731201171875,
      "learning_rate": 3.843439512918949e-07,
      "loss": 0.0257,
      "reward": -0.7321149110794067,
      "reward_std": 0.17255932837724686,
      "rewards/cosine_scaled_reward": -0.40772412717342377,
      "rewards/format_reward": 0.0833333358168602,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3003.0416870117188,
      "epoch": 0.18914285714285714,
      "grad_norm": 0.1611107587814331,
      "kl": 0.07293701171875,
      "learning_rate": 3.8142703296283953e-07,
      "loss": -0.0101,
      "reward": 0.07398295402526855,
      "reward_std": 0.43574428372085094,
      "rewards/cosine_scaled_reward": -0.10884186625480652,
      "rewards/format_reward": 0.2916666679084301,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3474.4166870117188,
      "epoch": 0.18971428571428572,
      "grad_norm": 0.1410750448703766,
      "kl": 0.0550537109375,
      "learning_rate": 3.785183306423767e-07,
      "loss": 0.0602,
      "reward": -0.3694458119571209,
      "reward_std": 0.21785772289149463,
      "rewards/cosine_scaled_reward": -0.2055562452878803,
      "rewards/format_reward": 0.0416666679084301,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3202.625,
      "epoch": 0.19028571428571428,
      "grad_norm": 0.2355586290359497,
      "kl": 0.069091796875,
      "learning_rate": 3.7561798609655373e-07,
      "loss": 0.1097,
      "reward": -0.24160130321979523,
      "reward_std": 0.5544586107134819,
      "rewards/cosine_scaled_reward": -0.20413399254903197,
      "rewards/format_reward": 0.1666666716337204,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.19085714285714286,
      "grad_norm": 0.1450648009777069,
      "kl": 0.0533447265625,
      "learning_rate": 3.72726140684072e-07,
      "loss": 0.0002,
      "reward": -0.5695072785019875,
      "reward_std": 0.1969633586704731,
      "rewards/cosine_scaled_reward": -0.284753642976284,
      "rewards/format_reward": 0.0,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3126.4583740234375,
      "epoch": 0.19142857142857142,
      "grad_norm": 0.2336735874414444,
      "kl": 0.058837890625,
      "learning_rate": 3.6984293534939737e-07,
      "loss": 0.1574,
      "reward": 0.08427366428077221,
      "reward_std": 0.6991970017552376,
      "rewards/cosine_scaled_reward": -0.12452983297407627,
      "rewards/format_reward": 0.3333333395421505,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3331.25,
      "epoch": 0.192,
      "grad_norm": 0.15116646885871887,
      "kl": 0.05267333984375,
      "learning_rate": 3.6696851061588994e-07,
      "loss": 0.0781,
      "reward": -0.09475219994783401,
      "reward_std": 0.5551117174327374,
      "rewards/cosine_scaled_reward": -0.13070943020284176,
      "rewards/format_reward": 0.1666666716337204,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3105.75,
      "epoch": 0.19257142857142856,
      "grad_norm": 0.3058691918849945,
      "kl": 0.05621337890625,
      "learning_rate": 3.641030065789562e-07,
      "loss": 0.1142,
      "reward": 0.07824870198965073,
      "reward_std": 0.3396912142634392,
      "rewards/cosine_scaled_reward": -0.06504232808947563,
      "rewards/format_reward": 0.2083333432674408,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2867.8333740234375,
      "epoch": 0.19314285714285714,
      "grad_norm": 0.2673719525337219,
      "kl": 0.070068359375,
      "learning_rate": 3.612465628992203e-07,
      "loss": 0.0946,
      "reward": -0.12158103799447417,
      "reward_std": 0.7564724162220955,
      "rewards/cosine_scaled_reward": -0.20662385318428278,
      "rewards/format_reward": 0.2916666679084301,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.19371428571428573,
      "grad_norm": 0.19467608630657196,
      "kl": 0.0643310546875,
      "learning_rate": 3.5839931879571725e-07,
      "loss": 0.0003,
      "reward": -0.6717243790626526,
      "reward_std": 0.19412844628095627,
      "rewards/cosine_scaled_reward": -0.3358621746301651,
      "rewards/format_reward": 0.0,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2584.916717529297,
      "epoch": 0.19428571428571428,
      "grad_norm": 0.22200323641300201,
      "kl": 0.0587158203125,
      "learning_rate": 3.555614130391079e-07,
      "loss": 0.111,
      "reward": -0.06582178920507431,
      "reward_std": 0.39784812927246094,
      "rewards/cosine_scaled_reward": -0.26207755878567696,
      "rewards/format_reward": 0.4583333432674408,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3294.4583740234375,
      "epoch": 0.19485714285714287,
      "grad_norm": 0.19403870403766632,
      "kl": 0.063079833984375,
      "learning_rate": 3.5273298394491515e-07,
      "loss": 0.1289,
      "reward": -0.3598571866750717,
      "reward_std": 0.6343272626399994,
      "rewards/cosine_scaled_reward": -0.26326192915439606,
      "rewards/format_reward": 0.1666666716337204,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3205.5,
      "epoch": 0.19542857142857142,
      "grad_norm": 0.26232898235321045,
      "kl": 0.06964111328125,
      "learning_rate": 3.4991416936678276e-07,
      "loss": 0.1101,
      "reward": -0.16020217537879944,
      "reward_std": 0.5374426189810038,
      "rewards/cosine_scaled_reward": -0.16343442350625992,
      "rewards/format_reward": 0.1666666716337204,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3284.3333740234375,
      "epoch": 0.196,
      "grad_norm": 0.1842142790555954,
      "kl": 0.08135986328125,
      "learning_rate": 3.471051066897562e-07,
      "loss": 0.0697,
      "reward": -0.3795701861381531,
      "reward_std": 0.38023433461785316,
      "rewards/cosine_scaled_reward": -0.2939517763443291,
      "rewards/format_reward": 0.2083333395421505,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3421.7916870117188,
      "epoch": 0.19657142857142856,
      "grad_norm": 0.17274045944213867,
      "kl": 0.05596923828125,
      "learning_rate": 3.4430593282358777e-07,
      "loss": 0.0354,
      "reward": -0.0571894496679306,
      "reward_std": 0.5195095278322697,
      "rewards/cosine_scaled_reward": -0.11192803084850311,
      "rewards/format_reward": 0.1666666716337204,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2716.4583740234375,
      "epoch": 0.19714285714285715,
      "grad_norm": 0.3007984757423401,
      "kl": 0.07659912109375,
      "learning_rate": 3.4151678419606233e-07,
      "loss": 0.071,
      "reward": 0.6559105515480042,
      "reward_std": 0.8571383208036423,
      "rewards/cosine_scaled_reward": 0.09878861159086227,
      "rewards/format_reward": 0.4583333395421505,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1977142857142857,
      "grad_norm": 0.16842573881149292,
      "kl": 0.056671142578125,
      "learning_rate": 3.387377967463493e-07,
      "loss": 0.0002,
      "reward": -0.4263014793395996,
      "reward_std": 0.184997221454978,
      "rewards/cosine_scaled_reward": -0.23398405965417624,
      "rewards/format_reward": 0.0416666679084301,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.1982857142857143,
      "grad_norm": 0.17800471186637878,
      "kl": 0.05902099609375,
      "learning_rate": 3.359691059183761e-07,
      "loss": 0.0002,
      "reward": -0.43403539061546326,
      "reward_std": 0.5467140041291714,
      "rewards/cosine_scaled_reward": -0.23785103484988213,
      "rewards/format_reward": 0.0416666679084301,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3102.75,
      "epoch": 0.19885714285714284,
      "grad_norm": 0.24414986371994019,
      "kl": 0.075439453125,
      "learning_rate": 3.3321084665422803e-07,
      "loss": 0.0301,
      "reward": -0.3239244371652603,
      "reward_std": 0.2671828344464302,
      "rewards/cosine_scaled_reward": -0.28696221858263016,
      "rewards/format_reward": 0.25,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3567.25,
      "epoch": 0.19942857142857143,
      "grad_norm": 0.1622619926929474,
      "kl": 0.060791015625,
      "learning_rate": 3.3046315338757026e-07,
      "loss": 0.0097,
      "reward": -0.4406840596348047,
      "reward_std": 0.6408121213316917,
      "rewards/cosine_scaled_reward": -0.24117536237463355,
      "rewards/format_reward": 0.0416666679084301,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3151.7083740234375,
      "epoch": 0.2,
      "grad_norm": 0.298951655626297,
      "kl": 0.064453125,
      "learning_rate": 3.2772616003709616e-07,
      "loss": 0.1421,
      "reward": 0.4877912010997534,
      "reward_std": 0.32348718494176865,
      "rewards/cosine_scaled_reward": 0.11889559030532837,
      "rewards/format_reward": 0.2500000074505806,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3521.25,
      "epoch": 0.20057142857142857,
      "grad_norm": 0.14912287890911102,
      "kl": 0.06524658203125,
      "learning_rate": 3.250000000000001e-07,
      "loss": 0.0257,
      "reward": -0.12844760343432426,
      "reward_std": 0.9212811887264252,
      "rewards/cosine_scaled_reward": -0.12672380730509758,
      "rewards/format_reward": 0.1250000037252903,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2800.875030517578,
      "epoch": 0.20114285714285715,
      "grad_norm": 0.2803266644477844,
      "kl": 0.058837890625,
      "learning_rate": 3.222848061454764e-07,
      "loss": 0.1305,
      "reward": 0.3184015303850174,
      "reward_std": 0.8227717503905296,
      "rewards/cosine_scaled_reward": -0.049132585525512695,
      "rewards/format_reward": 0.4166666716337204,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3494.6666870117188,
      "epoch": 0.2017142857142857,
      "grad_norm": 0.13803733885288239,
      "kl": 0.06195068359375,
      "learning_rate": 3.195807108082429e-07,
      "loss": 0.0364,
      "reward": -0.3796301782131195,
      "reward_std": 0.3742275796830654,
      "rewards/cosine_scaled_reward": -0.23148176074028015,
      "rewards/format_reward": 0.0833333358168602,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3425.7083740234375,
      "epoch": 0.2022857142857143,
      "grad_norm": 0.14631807804107666,
      "kl": 0.053466796875,
      "learning_rate": 3.168878457820915e-07,
      "loss": 0.0276,
      "reward": -0.3159081442281604,
      "reward_std": 0.33451056107878685,
      "rewards/cosine_scaled_reward": -0.24128740979358554,
      "rewards/format_reward": 0.1666666716337204,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2898.375,
      "epoch": 0.20285714285714285,
      "grad_norm": 0.18180125951766968,
      "kl": 0.06915283203125,
      "learning_rate": 3.142063423134644e-07,
      "loss": 0.0021,
      "reward": 0.19555070251226425,
      "reward_std": 0.2758802194148302,
      "rewards/cosine_scaled_reward": -0.08972465991973877,
      "rewards/format_reward": 0.375,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3481.7500610351562,
      "epoch": 0.20342857142857143,
      "grad_norm": 0.18392685055732727,
      "kl": 0.05126953125,
      "learning_rate": 3.115363310950578e-07,
      "loss": 0.0401,
      "reward": -0.02315519005060196,
      "reward_std": 0.9286646004766226,
      "rewards/cosine_scaled_reward": -0.11574426759034395,
      "rewards/format_reward": 0.2083333358168602,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3214.5833740234375,
      "epoch": 0.204,
      "grad_norm": 0.14562208950519562,
      "kl": 0.056884765625,
      "learning_rate": 3.0887794225945143e-07,
      "loss": -0.0449,
      "reward": -0.42156238853931427,
      "reward_std": 0.2620309516787529,
      "rewards/cosine_scaled_reward": -0.33578120172023773,
      "rewards/format_reward": 0.25,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3117.2083740234375,
      "epoch": 0.20457142857142857,
      "grad_norm": 0.17316700518131256,
      "kl": 0.06512451171875,
      "learning_rate": 3.062313053727671e-07,
      "loss": -0.0734,
      "reward": 0.12614673376083374,
      "reward_std": 0.474518496543169,
      "rewards/cosine_scaled_reward": -0.08275998383760452,
      "rewards/format_reward": 0.2916666679084301,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3429.8333740234375,
      "epoch": 0.20514285714285715,
      "grad_norm": 0.23892873525619507,
      "kl": 0.072265625,
      "learning_rate": 3.0359654942835247e-07,
      "loss": 0.0686,
      "reward": -0.5188581272959709,
      "reward_std": 0.37433599308133125,
      "rewards/cosine_scaled_reward": -0.32192906737327576,
      "rewards/format_reward": 0.1250000037252903,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3555.5416870117188,
      "epoch": 0.2057142857142857,
      "grad_norm": 0.12861715257167816,
      "kl": 0.0487060546875,
      "learning_rate": 3.0097380284049523e-07,
      "loss": 0.0163,
      "reward": -0.41498488187789917,
      "reward_std": 0.29756803065538406,
      "rewards/cosine_scaled_reward": -0.22832577303051949,
      "rewards/format_reward": 0.0416666679084301,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3466.9583740234375,
      "epoch": 0.2062857142857143,
      "grad_norm": 0.19341325759887695,
      "kl": 0.06231689453125,
      "learning_rate": 2.9836319343816397e-07,
      "loss": 0.0662,
      "reward": -0.44232044741511345,
      "reward_std": 0.38765473291277885,
      "rewards/cosine_scaled_reward": -0.24199354834854603,
      "rewards/format_reward": 0.0416666679084301,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.20685714285714285,
      "grad_norm": 0.13692274689674377,
      "kl": 0.04925537109375,
      "learning_rate": 2.9576484845877793e-07,
      "loss": 0.0002,
      "reward": -0.47414352430496365,
      "reward_std": 0.39991648495197296,
      "rewards/cosine_scaled_reward": -0.27873843535780907,
      "rewards/format_reward": 0.0833333358168602,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3512.75,
      "epoch": 0.20742857142857143,
      "grad_norm": 0.2301596701145172,
      "kl": 0.1016845703125,
      "learning_rate": 2.931788945420058e-07,
      "loss": 0.0428,
      "reward": -0.3924715518951416,
      "reward_std": 0.4111638516187668,
      "rewards/cosine_scaled_reward": -0.2170691154897213,
      "rewards/format_reward": 0.0416666679084301,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3534.4583740234375,
      "epoch": 0.208,
      "grad_norm": 0.13970957696437836,
      "kl": 0.0601806640625,
      "learning_rate": 2.9060545772359305e-07,
      "loss": 0.03,
      "reward": -0.5889072120189667,
      "reward_std": 0.3112984448671341,
      "rewards/cosine_scaled_reward": -0.31528693437576294,
      "rewards/format_reward": 0.0416666679084301,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3378.7083740234375,
      "epoch": 0.20857142857142857,
      "grad_norm": 0.22361378371715546,
      "kl": 0.06732177734375,
      "learning_rate": 2.8804466342921987e-07,
      "loss": 0.0314,
      "reward": -0.39835788309574127,
      "reward_std": 0.3161539016291499,
      "rewards/cosine_scaled_reward": -0.30334562435746193,
      "rewards/format_reward": 0.2083333432674408,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3558.625,
      "epoch": 0.20914285714285713,
      "grad_norm": 0.1580444574356079,
      "kl": 0.06201171875,
      "learning_rate": 2.854966364683872e-07,
      "loss": 0.0145,
      "reward": -0.4920261278748512,
      "reward_std": 0.28505855053663254,
      "rewards/cosine_scaled_reward": -0.2876797318458557,
      "rewards/format_reward": 0.0833333358168602,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2983.9583435058594,
      "epoch": 0.20971428571428571,
      "grad_norm": 0.16354723274707794,
      "kl": 0.06024169921875,
      "learning_rate": 2.829615010283344e-07,
      "loss": -0.0499,
      "reward": -0.0459320992231369,
      "reward_std": 0.4094668244943023,
      "rewards/cosine_scaled_reward": -0.14796602725982666,
      "rewards/format_reward": 0.25,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2961.625,
      "epoch": 0.2102857142857143,
      "grad_norm": 0.1965043544769287,
      "kl": 0.06488037109375,
      "learning_rate": 2.8043938066798645e-07,
      "loss": -0.0578,
      "reward": -0.2956502139568329,
      "reward_std": 0.37837161868810654,
      "rewards/cosine_scaled_reward": -0.27282512187957764,
      "rewards/format_reward": 0.25,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3540.8333740234375,
      "epoch": 0.21085714285714285,
      "grad_norm": 0.1523246318101883,
      "kl": 0.062255859375,
      "learning_rate": 2.7793039831193133e-07,
      "loss": 0.0014,
      "reward": -0.48877737671136856,
      "reward_std": 0.40059489756822586,
      "rewards/cosine_scaled_reward": -0.2860553557984531,
      "rewards/format_reward": 0.0833333358168602,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3571.0833740234375,
      "epoch": 0.21142857142857144,
      "grad_norm": 0.14142955839633942,
      "kl": 0.063232421875,
      "learning_rate": 2.7543467624442956e-07,
      "loss": 0.005,
      "reward": -0.4047308452427387,
      "reward_std": 0.5568485893309116,
      "rewards/cosine_scaled_reward": -0.2440320923924446,
      "rewards/format_reward": 0.0833333358168602,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3227.9583740234375,
      "epoch": 0.212,
      "grad_norm": 0.2218811810016632,
      "kl": 0.0560302734375,
      "learning_rate": 2.729523361034538e-07,
      "loss": 0.0255,
      "reward": 0.3210463747382164,
      "reward_std": 0.4591142237186432,
      "rewards/cosine_scaled_reward": 0.0355231873691082,
      "rewards/format_reward": 0.25,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.21257142857142858,
      "grad_norm": 0.15657925605773926,
      "kl": 0.06109619140625,
      "learning_rate": 2.7048349887476037e-07,
      "loss": 0.0002,
      "reward": -0.44183915108442307,
      "reward_std": 0.3711034543812275,
      "rewards/cosine_scaled_reward": -0.22091957554221153,
      "rewards/format_reward": 0.0,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3060.75,
      "epoch": 0.21314285714285713,
      "grad_norm": 0.26730263233184814,
      "kl": 0.059326171875,
      "learning_rate": 2.6802828488599294e-07,
      "loss": -0.0419,
      "reward": 0.1390758454799652,
      "reward_std": 0.4297582358121872,
      "rewards/cosine_scaled_reward": -0.07629543542861938,
      "rewards/format_reward": 0.2916666679084301,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.21371428571428572,
      "grad_norm": 0.18330542743206024,
      "kl": 0.0675048828125,
      "learning_rate": 2.655868138008171e-07,
      "loss": 0.0003,
      "reward": -0.6386789381504059,
      "reward_std": 0.2875619940459728,
      "rewards/cosine_scaled_reward": -0.31933946907520294,
      "rewards/format_reward": 0.0,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3327.7083740234375,
      "epoch": 0.21428571428571427,
      "grad_norm": 0.29869186878204346,
      "kl": 0.0648193359375,
      "learning_rate": 2.631592046130896e-07,
      "loss": 0.0974,
      "reward": 0.09292280673980713,
      "reward_std": 0.8725230973213911,
      "rewards/cosine_scaled_reward": -0.036871928721666336,
      "rewards/format_reward": 0.1666666716337204,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3528.3333740234375,
      "epoch": 0.21485714285714286,
      "grad_norm": 0.18041333556175232,
      "kl": 0.05859375,
      "learning_rate": 2.6074557564105724e-07,
      "loss": 0.0234,
      "reward": -0.5773191377520561,
      "reward_std": 0.5778478160500526,
      "rewards/cosine_scaled_reward": -0.33032623305916786,
      "rewards/format_reward": 0.0833333358168602,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3195.7083740234375,
      "epoch": 0.21542857142857144,
      "grad_norm": 0.2103518396615982,
      "kl": 0.07080078125,
      "learning_rate": 2.583460445215911e-07,
      "loss": 0.071,
      "reward": 0.030612453818321228,
      "reward_std": 0.5441110543906689,
      "rewards/cosine_scaled_reward": -0.10969378054141998,
      "rewards/format_reward": 0.2500000111758709,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3409.625,
      "epoch": 0.216,
      "grad_norm": 0.17528486251831055,
      "kl": 0.0775146484375,
      "learning_rate": 2.5596072820445254e-07,
      "loss": 0.043,
      "reward": -0.5896594896912575,
      "reward_std": 0.3001709654927254,
      "rewards/cosine_scaled_reward": -0.35732975602149963,
      "rewards/format_reward": 0.125,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3403.875,
      "epoch": 0.21657142857142858,
      "grad_norm": 0.15156695246696472,
      "kl": 0.066162109375,
      "learning_rate": 2.5358974294659373e-07,
      "loss": 0.0245,
      "reward": -0.30837604124099016,
      "reward_std": 0.2952228505164385,
      "rewards/cosine_scaled_reward": -0.2583546922542155,
      "rewards/format_reward": 0.2083333432674408,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3511.0000610351562,
      "epoch": 0.21714285714285714,
      "grad_norm": 0.14815667271614075,
      "kl": 0.06707763671875,
      "learning_rate": 2.512332043064913e-07,
      "loss": 0.036,
      "reward": -0.433868832886219,
      "reward_std": 0.5863572731614113,
      "rewards/cosine_scaled_reward": -0.2586010806262493,
      "rewards/format_reward": 0.0833333358168602,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3383.7083740234375,
      "epoch": 0.21771428571428572,
      "grad_norm": 0.1917722523212433,
      "kl": 0.07476806640625,
      "learning_rate": 2.488912271385139e-07,
      "loss": 0.0918,
      "reward": -0.5000769086182117,
      "reward_std": 0.5215454287827015,
      "rewards/cosine_scaled_reward": -0.3125384636223316,
      "rewards/format_reward": 0.1250000037252903,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.21828571428571428,
      "grad_norm": 0.15971945226192474,
      "kl": 0.0699462890625,
      "learning_rate": 2.465639255873246e-07,
      "loss": 0.0003,
      "reward": -0.7036767303943634,
      "reward_std": 0.18040955439209938,
      "rewards/cosine_scaled_reward": -0.3518383651971817,
      "rewards/format_reward": 0.0,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3277.2083740234375,
      "epoch": 0.21885714285714286,
      "grad_norm": 0.19748570024967194,
      "kl": 0.0924072265625,
      "learning_rate": 2.4425141308231765e-07,
      "loss": 0.0122,
      "reward": 0.2866915538907051,
      "reward_std": 0.7322729602456093,
      "rewards/cosine_scaled_reward": 0.018345749005675316,
      "rewards/format_reward": 0.2500000111758709,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.21942857142857142,
      "grad_norm": 0.14422692358493805,
      "kl": 0.058837890625,
      "learning_rate": 2.4195380233209006e-07,
      "loss": 0.0002,
      "reward": -0.4746512770652771,
      "reward_std": 0.40125712379813194,
      "rewards/cosine_scaled_reward": -0.25815898180007935,
      "rewards/format_reward": 0.0416666679084301,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2883.25,
      "epoch": 0.22,
      "grad_norm": 0.20794402062892914,
      "kl": 0.06744384765625,
      "learning_rate": 2.3967120531894857e-07,
      "loss": 0.0445,
      "reward": -0.0814894586801529,
      "reward_std": 0.5334790125489235,
      "rewards/cosine_scaled_reward": -0.16574472934007645,
      "rewards/format_reward": 0.25,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3516.2916870117188,
      "epoch": 0.22057142857142858,
      "grad_norm": 0.17907118797302246,
      "kl": 0.07537841796875,
      "learning_rate": 2.374037332934512e-07,
      "loss": 0.0077,
      "reward": -0.5339493155479431,
      "reward_std": 0.1955426223576069,
      "rewards/cosine_scaled_reward": -0.32947464287281036,
      "rewards/format_reward": 0.125,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2310.0416870117188,
      "epoch": 0.22114285714285714,
      "grad_norm": 0.26615405082702637,
      "kl": 0.078369140625,
      "learning_rate": 2.3515149676898552e-07,
      "loss": 0.1631,
      "reward": 0.8421094790101051,
      "reward_std": 0.3642051964998245,
      "rewards/cosine_scaled_reward": 0.19188803806900978,
      "rewards/format_reward": 0.4583333432674408,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2787.041717529297,
      "epoch": 0.22171428571428572,
      "grad_norm": 0.171456977725029,
      "kl": 0.06024169921875,
      "learning_rate": 2.3291460551638237e-07,
      "loss": 0.0421,
      "reward": 0.8226055353879929,
      "reward_std": 0.5110184028744698,
      "rewards/cosine_scaled_reward": 0.11963611841201782,
      "rewards/format_reward": 0.5833333358168602,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2454.875045776367,
      "epoch": 0.22228571428571428,
      "grad_norm": 0.20929378271102905,
      "kl": 0.06768798828125,
      "learning_rate": 2.306931685585657e-07,
      "loss": 0.1073,
      "reward": 0.9306686669588089,
      "reward_std": 1.0127770230174065,
      "rewards/cosine_scaled_reward": 0.19450097158551216,
      "rewards/format_reward": 0.5416666716337204,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3351.0416870117188,
      "epoch": 0.22285714285714286,
      "grad_norm": 0.28745704889297485,
      "kl": 0.06829833984375,
      "learning_rate": 2.2848729416523859e-07,
      "loss": 0.0874,
      "reward": -0.35296558355912566,
      "reward_std": 0.5742789804935455,
      "rewards/cosine_scaled_reward": -0.23898280411958694,
      "rewards/format_reward": 0.1250000037252903,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3579.2083740234375,
      "epoch": 0.22342857142857142,
      "grad_norm": 0.1473505049943924,
      "kl": 0.06964111328125,
      "learning_rate": 2.2629708984760706e-07,
      "loss": 0.003,
      "reward": -0.3405249584466219,
      "reward_std": 0.20318820234388113,
      "rewards/cosine_scaled_reward": -0.19109581504017115,
      "rewards/format_reward": 0.0416666679084301,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3410.5416870117188,
      "epoch": 0.224,
      "grad_norm": 0.16982409358024597,
      "kl": 0.06280517578125,
      "learning_rate": 2.2412266235313973e-07,
      "loss": 0.0332,
      "reward": -0.26890936493873596,
      "reward_std": 0.5837244279682636,
      "rewards/cosine_scaled_reward": -0.21778801828622818,
      "rewards/format_reward": 0.1666666716337204,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.22457142857142856,
      "grad_norm": 0.14082053303718567,
      "kl": 0.05859375,
      "learning_rate": 2.2196411766036487e-07,
      "loss": 0.0002,
      "reward": -0.4981812983751297,
      "reward_std": 0.3227638751268387,
      "rewards/cosine_scaled_reward": -0.26992398500442505,
      "rewards/format_reward": 0.0416666679084301,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2889.25,
      "epoch": 0.22514285714285714,
      "grad_norm": 0.20773468911647797,
      "kl": 0.07421875,
      "learning_rate": 2.1982156097370557e-07,
      "loss": 0.0717,
      "reward": 0.11960902251303196,
      "reward_std": 0.47472497448325157,
      "rewards/cosine_scaled_reward": -0.06519548781216145,
      "rewards/format_reward": 0.25,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.2257142857142857,
      "grad_norm": 0.18604017794132233,
      "kl": 0.05572509765625,
      "learning_rate": 2.1769509671835223e-07,
      "loss": 0.0002,
      "reward": -0.43610429018735886,
      "reward_std": 0.18078533560037613,
      "rewards/cosine_scaled_reward": -0.21805214136838913,
      "rewards/format_reward": 0.0,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3554.6666870117188,
      "epoch": 0.22628571428571428,
      "grad_norm": 0.12925174832344055,
      "kl": 0.05133056640625,
      "learning_rate": 2.1558482853517253e-07,
      "loss": 0.0112,
      "reward": -0.23759066313505173,
      "reward_std": 0.5703822672367096,
      "rewards/cosine_scaled_reward": -0.22296201065182686,
      "rewards/format_reward": 0.2083333395421505,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3202.0000610351562,
      "epoch": 0.22685714285714287,
      "grad_norm": 0.18819594383239746,
      "kl": 0.07476806640625,
      "learning_rate": 2.134908592756607e-07,
      "loss": 0.0672,
      "reward": -0.029868334531784058,
      "reward_std": 0.6436930447816849,
      "rewards/cosine_scaled_reward": -0.16076749563217163,
      "rewards/format_reward": 0.2916666716337204,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3310.25,
      "epoch": 0.22742857142857142,
      "grad_norm": 0.2150135636329651,
      "kl": 0.0743408203125,
      "learning_rate": 2.1141329099692406e-07,
      "loss": 0.1091,
      "reward": -0.34711187332868576,
      "reward_std": 0.4893313031643629,
      "rewards/cosine_scaled_reward": -0.2568892817944288,
      "rewards/format_reward": 0.1666666716337204,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.228,
      "grad_norm": 0.1621653437614441,
      "kl": 0.072998046875,
      "learning_rate": 2.0935222495670968e-07,
      "loss": 0.0003,
      "reward": -0.6336401849985123,
      "reward_std": 0.2006698123877868,
      "rewards/cosine_scaled_reward": -0.31682008877396584,
      "rewards/format_reward": 0.0,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2075.4583435058594,
      "epoch": 0.22857142857142856,
      "grad_norm": 0.3685378432273865,
      "kl": 0.0650634765625,
      "learning_rate": 2.0730776160846853e-07,
      "loss": 0.1218,
      "reward": 0.7177584320306778,
      "reward_std": 0.5401098467409611,
      "rewards/cosine_scaled_reward": 0.06721250712871552,
      "rewards/format_reward": 0.5833333358168602,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3315.5,
      "epoch": 0.22914285714285715,
      "grad_norm": 0.15908077359199524,
      "kl": 0.07159423828125,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.0029,
      "reward": 0.04471557028591633,
      "reward_std": 0.7973760291934013,
      "rewards/cosine_scaled_reward": -0.12347555533051491,
      "rewards/format_reward": 0.291666679084301,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2788.4166717529297,
      "epoch": 0.2297142857142857,
      "grad_norm": 0.27960649132728577,
      "kl": 0.05523681640625,
      "learning_rate": 2.032690407508949e-07,
      "loss": -0.0295,
      "reward": -0.3924320638179779,
      "reward_std": 0.39799761585891247,
      "rewards/cosine_scaled_reward": -0.21704937238246202,
      "rewards/format_reward": 0.0416666679084301,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2993.6666870117188,
      "epoch": 0.2302857142857143,
      "grad_norm": 0.3006753623485565,
      "kl": 0.066650390625,
      "learning_rate": 2.0127498008311922e-07,
      "loss": 0.1707,
      "reward": -0.2186871642479673,
      "reward_std": 0.7662175893783569,
      "rewards/cosine_scaled_reward": -0.27601024881005287,
      "rewards/format_reward": 0.3333333395421505,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3025.5416870117188,
      "epoch": 0.23085714285714284,
      "grad_norm": 0.17790178954601288,
      "kl": 0.07391357421875,
      "learning_rate": 1.9929791578083655e-07,
      "loss": -0.0457,
      "reward": -0.3540494963526726,
      "reward_std": 0.2286781333386898,
      "rewards/cosine_scaled_reward": -0.3020247519016266,
      "rewards/format_reward": 0.25,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3297.7083740234375,
      "epoch": 0.23142857142857143,
      "grad_norm": 0.21704569458961487,
      "kl": 0.06781005859375,
      "learning_rate": 1.9733794420337213e-07,
      "loss": 0.033,
      "reward": 0.39337925612926483,
      "reward_std": 0.5153894275426865,
      "rewards/cosine_scaled_reward": 0.050856344401836395,
      "rewards/format_reward": 0.2916666679084301,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.232,
      "grad_norm": 0.18073974549770355,
      "kl": 0.0775146484375,
      "learning_rate": 1.9539516087697517e-07,
      "loss": 0.0003,
      "reward": -0.72935850918293,
      "reward_std": 0.17922993749380112,
      "rewards/cosine_scaled_reward": -0.364679254591465,
      "rewards/format_reward": 0.0,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3288.375,
      "epoch": 0.23257142857142857,
      "grad_norm": 0.16751199960708618,
      "kl": 0.05029296875,
      "learning_rate": 1.934696604901642e-07,
      "loss": 0.0725,
      "reward": -0.014177265577018261,
      "reward_std": 0.5347921415232122,
      "rewards/cosine_scaled_reward": -0.09042196115478873,
      "rewards/format_reward": 0.1666666716337204,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3183.0833740234375,
      "epoch": 0.23314285714285715,
      "grad_norm": 0.289958655834198,
      "kl": 0.08087158203125,
      "learning_rate": 1.915615368891117e-07,
      "loss": 0.0824,
      "reward": -0.2237246111035347,
      "reward_std": 0.6425449755042791,
      "rewards/cosine_scaled_reward": -0.2368623074144125,
      "rewards/format_reward": 0.2500000111758709,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.2337142857142857,
      "grad_norm": 0.15507179498672485,
      "kl": 0.0733642578125,
      "learning_rate": 1.8967088307307e-07,
      "loss": 0.0003,
      "reward": -0.8279737383127213,
      "reward_std": 0.19148441590368748,
      "rewards/cosine_scaled_reward": -0.4139868766069412,
      "rewards/format_reward": 0.0,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3522.5833740234375,
      "epoch": 0.2342857142857143,
      "grad_norm": 0.15713585913181305,
      "kl": 0.0584716796875,
      "learning_rate": 1.8779779118983867e-07,
      "loss": 0.037,
      "reward": -0.6865072301588953,
      "reward_std": 0.4425274422392249,
      "rewards/cosine_scaled_reward": -0.3640869613736868,
      "rewards/format_reward": 0.0416666679084301,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.23485714285714285,
      "grad_norm": 0.14535216987133026,
      "kl": 0.05908203125,
      "learning_rate": 1.8594235253127372e-07,
      "loss": 0.0002,
      "reward": -0.46431963704526424,
      "reward_std": 0.40429612435400486,
      "rewards/cosine_scaled_reward": -0.2529931543394923,
      "rewards/format_reward": 0.0416666679084301,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.23542857142857143,
      "grad_norm": 0.14524948596954346,
      "kl": 0.059814453125,
      "learning_rate": 1.8410465752883758e-07,
      "loss": 0.0002,
      "reward": -0.7351335138082504,
      "reward_std": 0.2500329352915287,
      "rewards/cosine_scaled_reward": -0.3675667643547058,
      "rewards/format_reward": 0.0,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3240.375,
      "epoch": 0.236,
      "grad_norm": 0.1661958247423172,
      "kl": 0.04791259765625,
      "learning_rate": 1.822847957491922e-07,
      "loss": 0.049,
      "reward": -0.06640298664569855,
      "reward_std": 0.7313026990741491,
      "rewards/cosine_scaled_reward": -0.15820149332284927,
      "rewards/format_reward": 0.2500000111758709,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3034.4166870117188,
      "epoch": 0.23657142857142857,
      "grad_norm": 0.1746898889541626,
      "kl": 0.05645751953125,
      "learning_rate": 1.804828558898332e-07,
      "loss": 0.1069,
      "reward": -0.08215373568236828,
      "reward_std": 0.4830262325704098,
      "rewards/cosine_scaled_reward": -0.1452435301616788,
      "rewards/format_reward": 0.2083333432674408,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3484.5416870117188,
      "epoch": 0.23714285714285716,
      "grad_norm": 0.14372943341732025,
      "kl": 0.06488037109375,
      "learning_rate": 1.7869892577476722e-07,
      "loss": 0.0328,
      "reward": -0.33930344693362713,
      "reward_std": 0.47646061331033707,
      "rewards/cosine_scaled_reward": -0.27381839603185654,
      "rewards/format_reward": 0.2083333358168602,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3189.4583435058594,
      "epoch": 0.2377142857142857,
      "grad_norm": 0.304644376039505,
      "kl": 0.05975341796875,
      "learning_rate": 1.7693309235023127e-07,
      "loss": 0.1251,
      "reward": -0.2368946084752679,
      "reward_std": 0.1720826616510749,
      "rewards/cosine_scaled_reward": -0.20178064471110702,
      "rewards/format_reward": 0.1666666716337204,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3047.5,
      "epoch": 0.2382857142857143,
      "grad_norm": 0.13361294567584991,
      "kl": 0.061279296875,
      "learning_rate": 1.7518544168045524e-07,
      "loss": 0.06,
      "reward": 0.06008124351501465,
      "reward_std": 0.7015728913247585,
      "rewards/cosine_scaled_reward": -0.11579271033406258,
      "rewards/format_reward": 0.2916666679084301,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3249.7083740234375,
      "epoch": 0.23885714285714285,
      "grad_norm": 0.2082819938659668,
      "kl": 0.0965576171875,
      "learning_rate": 1.7345605894346726e-07,
      "loss": 0.0679,
      "reward": -0.3713177442550659,
      "reward_std": 0.2855104599148035,
      "rewards/cosine_scaled_reward": -0.28982554003596306,
      "rewards/format_reward": 0.2083333432674408,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3562.25,
      "epoch": 0.23942857142857144,
      "grad_norm": 0.2277977466583252,
      "kl": 0.060546875,
      "learning_rate": 1.7174502842694212e-07,
      "loss": 0.0129,
      "reward": -0.4065123088657856,
      "reward_std": 0.28984224516898394,
      "rewards/cosine_scaled_reward": -0.2449228223413229,
      "rewards/format_reward": 0.0833333358168602,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2630.916717529297,
      "epoch": 0.24,
      "grad_norm": 0.1967594474554062,
      "kl": 0.0904541015625,
      "learning_rate": 1.7005243352409333e-07,
      "loss": 0.0107,
      "reward": 0.7067751418799162,
      "reward_std": 0.7725067064166069,
      "rewards/cosine_scaled_reward": 0.040887586772441864,
      "rewards/format_reward": 0.6250000149011612,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3460.7916870117188,
      "epoch": 0.24057142857142857,
      "grad_norm": 0.16090837121009827,
      "kl": 0.06695556640625,
      "learning_rate": 1.6837835672960831e-07,
      "loss": 0.066,
      "reward": -0.6657570600509644,
      "reward_std": 0.2832261845469475,
      "rewards/cosine_scaled_reward": -0.3537118658423424,
      "rewards/format_reward": 0.0416666679084301,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.24114285714285713,
      "grad_norm": 0.13034385442733765,
      "kl": 0.0517578125,
      "learning_rate": 1.6672287963562852e-07,
      "loss": 0.0002,
      "reward": -0.6389777138829231,
      "reward_std": 0.23646372184157372,
      "rewards/cosine_scaled_reward": -0.31948884204030037,
      "rewards/format_reward": 0.0,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3240.7083740234375,
      "epoch": 0.24171428571428571,
      "grad_norm": 0.21142347157001495,
      "kl": 0.0682373046875,
      "learning_rate": 1.6508608292777203e-07,
      "loss": 0.0498,
      "reward": 0.22329876571893692,
      "reward_std": 0.7350533455610275,
      "rewards/cosine_scaled_reward": -0.013350628316402435,
      "rewards/format_reward": 0.2500000074505806,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3533.0416870117188,
      "epoch": 0.2422857142857143,
      "grad_norm": 0.15932375192642212,
      "kl": 0.07061767578125,
      "learning_rate": 1.6346804638120098e-07,
      "loss": 0.0236,
      "reward": -0.3861430063843727,
      "reward_std": 0.6368795782327652,
      "rewards/cosine_scaled_reward": -0.23473817482590675,
      "rewards/format_reward": 0.0833333358168602,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.24285714285714285,
      "grad_norm": 0.15948008000850677,
      "kl": 0.0645751953125,
      "learning_rate": 1.6186884885673413e-07,
      "loss": 0.0003,
      "reward": -0.6987170726060867,
      "reward_std": 0.15650003217160702,
      "rewards/cosine_scaled_reward": -0.34935855120420456,
      "rewards/format_reward": 0.0,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3515.0,
      "epoch": 0.24342857142857144,
      "grad_norm": 0.1669153869152069,
      "kl": 0.0499267578125,
      "learning_rate": 1.6028856829700258e-07,
      "loss": 0.0158,
      "reward": -0.10988223552703857,
      "reward_std": 0.5193835459649563,
      "rewards/cosine_scaled_reward": -0.13827446848154068,
      "rewards/format_reward": 0.1666666716337204,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.244,
      "grad_norm": 0.1475888043642044,
      "kl": 0.0557861328125,
      "learning_rate": 1.5872728172265146e-07,
      "loss": 0.0002,
      "reward": -0.35475102812051773,
      "reward_std": 0.5787421241402626,
      "rewards/cosine_scaled_reward": -0.23987551219761372,
      "rewards/format_reward": 0.1250000037252903,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3454.8333740234375,
      "epoch": 0.24457142857142858,
      "grad_norm": 0.15619736909866333,
      "kl": 0.06072998046875,
      "learning_rate": 1.5718506522858572e-07,
      "loss": 0.0498,
      "reward": -0.4008786417543888,
      "reward_std": 0.5806032568216324,
      "rewards/cosine_scaled_reward": -0.26293932646512985,
      "rewards/format_reward": 0.1250000037252903,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3573.5416870117188,
      "epoch": 0.24514285714285713,
      "grad_norm": 0.16329319775104523,
      "kl": 0.0654296875,
      "learning_rate": 1.5566199398026147e-07,
      "loss": 0.0062,
      "reward": -0.6754651218652725,
      "reward_std": 0.2705872841179371,
      "rewards/cosine_scaled_reward": -0.35856589674949646,
      "rewards/format_reward": 0.0416666679084301,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.24571428571428572,
      "grad_norm": 0.153706356883049,
      "kl": 0.06573486328125,
      "learning_rate": 1.5415814221002265e-07,
      "loss": 0.0003,
      "reward": -0.6211144402623177,
      "reward_std": 0.2846235502511263,
      "rewards/cosine_scaled_reward": -0.331390555948019,
      "rewards/format_reward": 0.0416666679084301,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.24628571428571427,
      "grad_norm": 0.14263786375522614,
      "kl": 0.05303955078125,
      "learning_rate": 1.5267358321348285e-07,
      "loss": 0.0002,
      "reward": -0.5838388651609421,
      "reward_std": 0.299509858712554,
      "rewards/cosine_scaled_reward": -0.29191943258047104,
      "rewards/format_reward": 0.0,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3518.5,
      "epoch": 0.24685714285714286,
      "grad_norm": 0.1427145153284073,
      "kl": 0.06005859375,
      "learning_rate": 1.5120838934595337e-07,
      "loss": -0.0016,
      "reward": -0.22047018259763718,
      "reward_std": 0.6944977939128876,
      "rewards/cosine_scaled_reward": -0.1727350875735283,
      "rewards/format_reward": 0.1250000037252903,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3022.4166870117188,
      "epoch": 0.24742857142857144,
      "grad_norm": 0.1529431790113449,
      "kl": 0.06256103515625,
      "learning_rate": 1.4976263201891613e-07,
      "loss": -0.0272,
      "reward": 0.18959258869290352,
      "reward_std": 0.38590487092733383,
      "rewards/cosine_scaled_reward": -0.030203720554709435,
      "rewards/format_reward": 0.25,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3537.1666870117188,
      "epoch": 0.248,
      "grad_norm": 0.17672474682331085,
      "kl": 0.06597900390625,
      "learning_rate": 1.483363816965435e-07,
      "loss": 0.0283,
      "reward": -0.37754696048796177,
      "reward_std": 0.4972882941365242,
      "rewards/cosine_scaled_reward": -0.20960681792348623,
      "rewards/format_reward": 0.0416666679084301,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3184.625,
      "epoch": 0.24857142857142858,
      "grad_norm": 0.1457490473985672,
      "kl": 0.05413818359375,
      "learning_rate": 1.469297078922642e-07,
      "loss": -0.0538,
      "reward": 0.18912622332572937,
      "reward_std": 0.17255538050085306,
      "rewards/cosine_scaled_reward": -0.03043687343597412,
      "rewards/format_reward": 0.25,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3228.875,
      "epoch": 0.24914285714285714,
      "grad_norm": 0.26985064148902893,
      "kl": 0.05609130859375,
      "learning_rate": 1.4554267916537495e-07,
      "loss": 0.0741,
      "reward": -0.41794606298208237,
      "reward_std": 0.19913825299590826,
      "rewards/cosine_scaled_reward": -0.3131397217512131,
      "rewards/format_reward": 0.2083333395421505,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2429.8333435058594,
      "epoch": 0.24971428571428572,
      "grad_norm": 0.33341628313064575,
      "kl": 0.0677490234375,
      "learning_rate": 1.4417536311769885e-07,
      "loss": 0.1697,
      "reward": 0.001805141568183899,
      "reward_std": 0.49792607966810465,
      "rewards/cosine_scaled_reward": -0.22826410830020905,
      "rewards/format_reward": 0.4583333432674408,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.2502857142857143,
      "grad_norm": 0.14212451875209808,
      "kl": 0.0621337890625,
      "learning_rate": 1.4282782639029128e-07,
      "loss": 0.0002,
      "reward": -0.698416069149971,
      "reward_std": 0.15093004517257214,
      "rewards/cosine_scaled_reward": -0.3492080345749855,
      "rewards/format_reward": 0.0,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3442.1250610351562,
      "epoch": 0.25085714285714283,
      "grad_norm": 0.16146516799926758,
      "kl": 0.0662841796875,
      "learning_rate": 1.4150013466019114e-07,
      "loss": 0.0241,
      "reward": -0.31549201533198357,
      "reward_std": 0.500336742028594,
      "rewards/cosine_scaled_reward": -0.22024601697921753,
      "rewards/format_reward": 0.1250000037252903,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3325.9583740234375,
      "epoch": 0.25142857142857145,
      "grad_norm": 0.21158044040203094,
      "kl": 0.082275390625,
      "learning_rate": 1.4019235263722034e-07,
      "loss": 0.1078,
      "reward": -0.5876736007630825,
      "reward_std": 0.39598204009234905,
      "rewards/cosine_scaled_reward": -0.377170130610466,
      "rewards/format_reward": 0.1666666716337204,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.252,
      "grad_norm": 0.15019887685775757,
      "kl": 0.059326171875,
      "learning_rate": 1.3890454406082956e-07,
      "loss": 0.0002,
      "reward": -0.6195696294307709,
      "reward_std": 0.2191491797566414,
      "rewards/cosine_scaled_reward": -0.30978482961654663,
      "rewards/format_reward": 0.0,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3102.7083740234375,
      "epoch": 0.25257142857142856,
      "grad_norm": 0.1601240336894989,
      "kl": 0.05877685546875,
      "learning_rate": 1.3763677169699217e-07,
      "loss": 0.166,
      "reward": -0.11922668665647507,
      "reward_std": 0.5712239369750023,
      "rewards/cosine_scaled_reward": -0.18461336567997932,
      "rewards/format_reward": 0.2500000074505806,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3339.2083740234375,
      "epoch": 0.25314285714285717,
      "grad_norm": 0.2830808758735657,
      "kl": 0.0550537109375,
      "learning_rate": 1.3638909733514452e-07,
      "loss": 0.1017,
      "reward": -0.5837467163801193,
      "reward_std": 0.38514454290270805,
      "rewards/cosine_scaled_reward": -0.35437335819005966,
      "rewards/format_reward": 0.1250000037252903,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.2537142857142857,
      "grad_norm": 0.12788225710391998,
      "kl": 0.0516357421875,
      "learning_rate": 1.351615817851748e-07,
      "loss": 0.0002,
      "reward": -0.8638512194156647,
      "reward_std": 0.12708378583192825,
      "rewards/cosine_scaled_reward": -0.43192560970783234,
      "rewards/format_reward": 0.0,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3210.6666870117188,
      "epoch": 0.2542857142857143,
      "grad_norm": 0.17025582492351532,
      "kl": 0.0570068359375,
      "learning_rate": 1.3395428487445914e-07,
      "loss": -0.0604,
      "reward": 0.14303337410092354,
      "reward_std": 0.35370171954855323,
      "rewards/cosine_scaled_reward": -0.053483348339796066,
      "rewards/format_reward": 0.25,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2819.4166870117188,
      "epoch": 0.25485714285714284,
      "grad_norm": 0.522460401058197,
      "kl": 0.06072998046875,
      "learning_rate": 1.3276726544494571e-07,
      "loss": 0.1376,
      "reward": -0.20927497744560242,
      "reward_std": 0.5216442719101906,
      "rewards/cosine_scaled_reward": -0.3129708394408226,
      "rewards/format_reward": 0.4166666828095913,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3188.0,
      "epoch": 0.25542857142857145,
      "grad_norm": 0.1497071087360382,
      "kl": 0.0562744140625,
      "learning_rate": 1.316005813502869e-07,
      "loss": 0.0099,
      "reward": 0.11079806089401245,
      "reward_std": 0.2190828826278448,
      "rewards/cosine_scaled_reward": -0.06960096955299377,
      "rewards/format_reward": 0.25,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.256,
      "grad_norm": 0.13108882308006287,
      "kl": 0.06268310546875,
      "learning_rate": 1.3045428945301953e-07,
      "loss": 0.0003,
      "reward": 0.10701127722859383,
      "reward_std": 0.24802839010953903,
      "rewards/cosine_scaled_reward": 0.03267230466008186,
      "rewards/format_reward": 0.0416666679084301,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3524.125,
      "epoch": 0.25657142857142856,
      "grad_norm": 0.16846035420894623,
      "kl": 0.04693603515625,
      "learning_rate": 1.2932844562179352e-07,
      "loss": 0.0308,
      "reward": 0.05915266275405884,
      "reward_std": 0.8702057525515556,
      "rewards/cosine_scaled_reward": -0.012090334668755531,
      "rewards/format_reward": 0.0833333358168602,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2342.3334045410156,
      "epoch": 0.2571428571428571,
      "grad_norm": 0.5184911489486694,
      "kl": 0.0618896484375,
      "learning_rate": 1.2822310472864885e-07,
      "loss": 0.2169,
      "reward": 0.7721665650606155,
      "reward_std": 0.7354639321565628,
      "rewards/cosine_scaled_reward": 0.11524996906518936,
      "rewards/format_reward": 0.5416666716337204,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3561.3333740234375,
      "epoch": 0.25771428571428573,
      "grad_norm": 0.22711819410324097,
      "kl": 0.07025146484375,
      "learning_rate": 1.2713832064634125e-07,
      "loss": 0.0134,
      "reward": -0.7618950307369232,
      "reward_std": 0.3447403945028782,
      "rewards/cosine_scaled_reward": -0.4017808511853218,
      "rewards/format_reward": 0.0416666679084301,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3037.6250610351562,
      "epoch": 0.2582857142857143,
      "grad_norm": 0.28949570655822754,
      "kl": 0.06170654296875,
      "learning_rate": 1.260741462457165e-07,
      "loss": 0.2172,
      "reward": -0.047480987675953656,
      "reward_std": 0.5772276148200035,
      "rewards/cosine_scaled_reward": -0.1279071494936943,
      "rewards/format_reward": 0.2083333358168602,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3401.0000610351562,
      "epoch": 0.25885714285714284,
      "grad_norm": 0.25010257959365845,
      "kl": 0.06524658203125,
      "learning_rate": 1.2503063339313356e-07,
      "loss": 0.0653,
      "reward": -0.24748031795024872,
      "reward_std": 0.6849566511809826,
      "rewards/cosine_scaled_reward": -0.20707351714372635,
      "rewards/format_reward": 0.1666666679084301,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3466.9166870117188,
      "epoch": 0.25942857142857145,
      "grad_norm": 0.209227055311203,
      "kl": 0.0743408203125,
      "learning_rate": 1.2400783294793668e-07,
      "loss": 0.0432,
      "reward": -0.3158434331417084,
      "reward_std": 0.6184379309415817,
      "rewards/cosine_scaled_reward": -0.1995883844792843,
      "rewards/format_reward": 0.0833333358168602,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.26,
      "grad_norm": 0.19111324846744537,
      "kl": 0.0718994140625,
      "learning_rate": 1.2300579475997657e-07,
      "loss": 0.0003,
      "reward": -0.541237011551857,
      "reward_std": 0.22257393412292004,
      "rewards/cosine_scaled_reward": -0.2706185057759285,
      "rewards/format_reward": 0.0,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.26057142857142856,
      "grad_norm": 0.15810757875442505,
      "kl": 0.06884765625,
      "learning_rate": 1.220245676671809e-07,
      "loss": 0.0003,
      "reward": -0.4421476610004902,
      "reward_std": 0.19191257283091545,
      "rewards/cosine_scaled_reward": -0.22107383236289024,
      "rewards/format_reward": 0.0,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2937.125,
      "epoch": 0.2611428571428571,
      "grad_norm": 0.2057061344385147,
      "kl": 0.140380859375,
      "learning_rate": 1.2106419949317388e-07,
      "loss": 0.0211,
      "reward": -0.03692680597305298,
      "reward_std": 0.3839663378894329,
      "rewards/cosine_scaled_reward": -0.1434633992612362,
      "rewards/format_reward": 0.25,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3399.2083740234375,
      "epoch": 0.26171428571428573,
      "grad_norm": 0.20098821818828583,
      "kl": 0.066162109375,
      "learning_rate": 1.2012473704494537e-07,
      "loss": 0.0564,
      "reward": -0.3796830028295517,
      "reward_std": 0.35559918358922005,
      "rewards/cosine_scaled_reward": -0.2315081711858511,
      "rewards/format_reward": 0.0833333358168602,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.2622857142857143,
      "grad_norm": 0.188381627202034,
      "kl": 0.05596923828125,
      "learning_rate": 1.1920622611056974e-07,
      "loss": 0.0002,
      "reward": -0.4935343600809574,
      "reward_std": 0.35022899881005287,
      "rewards/cosine_scaled_reward": -0.2467671800404787,
      "rewards/format_reward": 0.0,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2985.9166870117188,
      "epoch": 0.26285714285714284,
      "grad_norm": 0.23509609699249268,
      "kl": 0.06756591796875,
      "learning_rate": 1.1830871145697412e-07,
      "loss": -0.0768,
      "reward": 0.22741861082613468,
      "reward_std": 0.20784308947622776,
      "rewards/cosine_scaled_reward": -0.011290664784610271,
      "rewards/format_reward": 0.25,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2779.5417098999023,
      "epoch": 0.2634285714285714,
      "grad_norm": 0.452884316444397,
      "kl": 0.06634521484375,
      "learning_rate": 1.1743223682775649e-07,
      "loss": 0.1131,
      "reward": -0.07000309228897095,
      "reward_std": 0.4777501877397299,
      "rewards/cosine_scaled_reward": -0.20166821405291557,
      "rewards/format_reward": 0.3333333358168602,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3556.5,
      "epoch": 0.264,
      "grad_norm": 0.13507026433944702,
      "kl": 0.04974365234375,
      "learning_rate": 1.1657684494105386e-07,
      "loss": 0.0163,
      "reward": -0.47025562822818756,
      "reward_std": 0.5417722593992949,
      "rewards/cosine_scaled_reward": -0.2559611462056637,
      "rewards/format_reward": 0.0416666679084301,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.26457142857142857,
      "grad_norm": 0.139174684882164,
      "kl": 0.05755615234375,
      "learning_rate": 1.1574257748745986e-07,
      "loss": 0.0002,
      "reward": -0.5710950195789337,
      "reward_std": 0.528672531247139,
      "rewards/cosine_scaled_reward": -0.30638084560632706,
      "rewards/format_reward": 0.0416666679084301,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3249.125,
      "epoch": 0.2651428571428571,
      "grad_norm": 0.16046997904777527,
      "kl": 0.05987548828125,
      "learning_rate": 1.1492947512799328e-07,
      "loss": -0.0441,
      "reward": -0.28501003235578537,
      "reward_std": 0.4225130509585142,
      "rewards/cosine_scaled_reward": -0.24667168036103249,
      "rewards/format_reward": 0.2083333432674408,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3547.0,
      "epoch": 0.26571428571428574,
      "grad_norm": 0.1321614384651184,
      "kl": 0.04974365234375,
      "learning_rate": 1.1413757749211602e-07,
      "loss": 0.0219,
      "reward": -0.4046256057918072,
      "reward_std": 0.48913862090557814,
      "rewards/cosine_scaled_reward": -0.22314614709466696,
      "rewards/format_reward": 0.0416666679084301,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3458.1250610351562,
      "epoch": 0.2662857142857143,
      "grad_norm": 0.14799264073371887,
      "kl": 0.0499267578125,
      "learning_rate": 1.1336692317580158e-07,
      "loss": 0.0464,
      "reward": -0.34675589203834534,
      "reward_std": 0.5809434130787849,
      "rewards/cosine_scaled_reward": -0.2983779553323984,
      "rewards/format_reward": 0.2500000074505806,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.26685714285714285,
      "grad_norm": 0.14886276423931122,
      "kl": 0.05316162109375,
      "learning_rate": 1.1261754973965422e-07,
      "loss": 0.0002,
      "reward": -0.4710244685411453,
      "reward_std": 0.23188624903559685,
      "rewards/cosine_scaled_reward": -0.23551223799586296,
      "rewards/format_reward": 0.0,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3387.8333740234375,
      "epoch": 0.2674285714285714,
      "grad_norm": 0.1751989722251892,
      "kl": 0.05670166015625,
      "learning_rate": 1.1188949370707787e-07,
      "loss": -0.0164,
      "reward": -0.15125497430562973,
      "reward_std": 0.4892147481441498,
      "rewards/cosine_scaled_reward": -0.17979413457214832,
      "rewards/format_reward": 0.2083333432674408,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.268,
      "grad_norm": 0.14392061531543732,
      "kl": 0.06378173828125,
      "learning_rate": 1.1118279056249653e-07,
      "loss": 0.0003,
      "reward": -0.7428162097930908,
      "reward_std": 0.17959357798099518,
      "rewards/cosine_scaled_reward": -0.3714081048965454,
      "rewards/format_reward": 0.0,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2713.6250610351562,
      "epoch": 0.26857142857142857,
      "grad_norm": 0.2141987532377243,
      "kl": 0.0821533203125,
      "learning_rate": 1.1049747474962444e-07,
      "loss": 0.0536,
      "reward": 0.39758430421352386,
      "reward_std": 0.6199628822505474,
      "rewards/cosine_scaled_reward": -0.009541166247799993,
      "rewards/format_reward": 0.4166666716337204,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3557.75,
      "epoch": 0.26914285714285713,
      "grad_norm": 0.14714567363262177,
      "kl": 0.0565185546875,
      "learning_rate": 1.0983357966978745e-07,
      "loss": 0.0138,
      "reward": -0.365282267332077,
      "reward_std": 0.6022834666073322,
      "rewards/cosine_scaled_reward": -0.2451411336660385,
      "rewards/format_reward": 0.1250000037252903,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3472.7916870117188,
      "epoch": 0.26971428571428574,
      "grad_norm": 0.179129496216774,
      "kl": 0.0621337890625,
      "learning_rate": 1.0919113768029517e-07,
      "loss": 0.0498,
      "reward": -0.3794557135552168,
      "reward_std": 0.3991483077406883,
      "rewards/cosine_scaled_reward": -0.2522278605028987,
      "rewards/format_reward": 0.1250000037252903,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3019.2916870117188,
      "epoch": 0.2702857142857143,
      "grad_norm": 0.22841279208660126,
      "kl": 0.06793212890625,
      "learning_rate": 1.0857018009286381e-07,
      "loss": 0.0973,
      "reward": -0.4580535739660263,
      "reward_std": 0.39184647984802723,
      "rewards/cosine_scaled_reward": -0.37486011534929276,
      "rewards/format_reward": 0.2916666679084301,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2870.916717529297,
      "epoch": 0.27085714285714285,
      "grad_norm": 0.1689484417438507,
      "kl": 0.0419921875,
      "learning_rate": 1.0797073717209013e-07,
      "loss": -0.0129,
      "reward": 0.28269072622060776,
      "reward_std": 0.2588655799627304,
      "rewards/cosine_scaled_reward": -0.08782129734754562,
      "rewards/format_reward": 0.4583333432674408,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3473.5833740234375,
      "epoch": 0.2714285714285714,
      "grad_norm": 0.475265234708786,
      "kl": 0.16973876953125,
      "learning_rate": 1.0739283813397639e-07,
      "loss": 0.0462,
      "reward": -0.11720195040106773,
      "reward_std": 0.746030643582344,
      "rewards/cosine_scaled_reward": -0.14193430170416832,
      "rewards/format_reward": 0.1666666716337204,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3468.9583740234375,
      "epoch": 0.272,
      "grad_norm": 0.22294382750988007,
      "kl": 0.06390380859375,
      "learning_rate": 1.068365111445064e-07,
      "loss": 0.0685,
      "reward": -0.8103623390197754,
      "reward_std": 0.16845603752881289,
      "rewards/cosine_scaled_reward": -0.4260144978761673,
      "rewards/format_reward": 0.0416666679084301,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.2725714285714286,
      "grad_norm": 0.163096621632576,
      "kl": 0.07647705078125,
      "learning_rate": 1.063017833182728e-07,
      "loss": 0.0003,
      "reward": -0.7045374438166618,
      "reward_std": 0.3001190824434161,
      "rewards/cosine_scaled_reward": -0.393935389816761,
      "rewards/format_reward": 0.0833333358168602,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.27314285714285713,
      "grad_norm": 0.16238611936569214,
      "kl": 0.051025390625,
      "learning_rate": 1.0578868071715544e-07,
      "loss": 0.0002,
      "reward": -0.49170275777578354,
      "reward_std": 0.1920237122103572,
      "rewards/cosine_scaled_reward": -0.24585136957466602,
      "rewards/format_reward": 0.0,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3314.25,
      "epoch": 0.2737142857142857,
      "grad_norm": 0.16748814284801483,
      "kl": 0.06201171875,
      "learning_rate": 1.0529722834905125e-07,
      "loss": 0.0871,
      "reward": 0.5320697575807571,
      "reward_std": 0.6121283564716578,
      "rewards/cosine_scaled_reward": 0.16186823695898056,
      "rewards/format_reward": 0.2083333358168602,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3570.9166870117188,
      "epoch": 0.2742857142857143,
      "grad_norm": 0.15536968410015106,
      "kl": 0.0684814453125,
      "learning_rate": 1.0482745016665526e-07,
      "loss": 0.0078,
      "reward": -0.5202671885490417,
      "reward_std": 0.39154190942645073,
      "rewards/cosine_scaled_reward": -0.2809669245034456,
      "rewards/format_reward": 0.0416666679084301,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3291.9583740234375,
      "epoch": 0.27485714285714286,
      "grad_norm": 0.26999354362487793,
      "kl": 0.0859375,
      "learning_rate": 1.0437936906629334e-07,
      "loss": 0.092,
      "reward": -0.4586787410080433,
      "reward_std": 0.26687505654990673,
      "rewards/cosine_scaled_reward": -0.291839387267828,
      "rewards/format_reward": 0.125,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3199.75,
      "epoch": 0.2754285714285714,
      "grad_norm": 0.15752355754375458,
      "kl": 0.0611572265625,
      "learning_rate": 1.0395300688680625e-07,
      "loss": 0.0365,
      "reward": -0.32594752311706543,
      "reward_std": 0.4160085953772068,
      "rewards/cosine_scaled_reward": -0.2671404145658016,
      "rewards/format_reward": 0.2083333432674408,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3544.6666870117188,
      "epoch": 0.276,
      "grad_norm": 0.14844165742397308,
      "kl": 0.06427001953125,
      "learning_rate": 1.0354838440848501e-07,
      "loss": 0.0233,
      "reward": -0.6612615287303925,
      "reward_std": 0.4578050300478935,
      "rewards/cosine_scaled_reward": -0.3514641039073467,
      "rewards/format_reward": 0.0416666679084301,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2735.25,
      "epoch": 0.2765714285714286,
      "grad_norm": 0.2591208815574646,
      "kl": 0.0555419921875,
      "learning_rate": 1.0316552135205837e-07,
      "loss": -0.024,
      "reward": -0.3376142382621765,
      "reward_std": 0.3799719735980034,
      "rewards/cosine_scaled_reward": -0.29380711913108826,
      "rewards/format_reward": 0.2500000111758709,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3395.5833740234375,
      "epoch": 0.27714285714285714,
      "grad_norm": 0.1682201325893402,
      "kl": 0.060791015625,
      "learning_rate": 1.0280443637773163e-07,
      "loss": 0.0391,
      "reward": -0.04071862995624542,
      "reward_std": 0.600635758601129,
      "rewards/cosine_scaled_reward": -0.10369264334440231,
      "rewards/format_reward": 0.1666666716337204,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2664.500045776367,
      "epoch": 0.2777142857142857,
      "grad_norm": 0.3096904456615448,
      "kl": 0.05816650390625,
      "learning_rate": 1.0246514708427701e-07,
      "loss": 0.0887,
      "reward": -0.291459396481514,
      "reward_std": 0.21652375906705856,
      "rewards/cosine_scaled_reward": -0.29156303964555264,
      "rewards/format_reward": 0.2916666679084301,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3101.9166870117188,
      "epoch": 0.2782857142857143,
      "grad_norm": 0.25412291288375854,
      "kl": 0.0621337890625,
      "learning_rate": 1.0214767000817596e-07,
      "loss": 0.1419,
      "reward": 0.23972990177571774,
      "reward_std": 0.9307698365300894,
      "rewards/cosine_scaled_reward": -0.00513505470007658,
      "rewards/format_reward": 0.2500000074505806,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3562.4583740234375,
      "epoch": 0.27885714285714286,
      "grad_norm": 0.15202371776103973,
      "kl": 0.05609130859375,
      "learning_rate": 1.0185202062281336e-07,
      "loss": 0.0126,
      "reward": -0.47963531874120235,
      "reward_std": 0.48471274971961975,
      "rewards/cosine_scaled_reward": -0.26065099239349365,
      "rewards/format_reward": 0.0416666679084301,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.2794285714285714,
      "grad_norm": 0.16845981776714325,
      "kl": 0.0772705078125,
      "learning_rate": 1.0157821333772304e-07,
      "loss": 0.0003,
      "reward": -0.5912460908293724,
      "reward_std": 0.15500911697745323,
      "rewards/cosine_scaled_reward": -0.2956230528652668,
      "rewards/format_reward": 0.0,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2680.6666870117188,
      "epoch": 0.28,
      "grad_norm": 0.2831243872642517,
      "kl": 0.05218505859375,
      "learning_rate": 1.013262614978859e-07,
      "loss": 0.1674,
      "reward": 0.24096882343292236,
      "reward_std": 0.7047604825347662,
      "rewards/cosine_scaled_reward": -0.08784890919923782,
      "rewards/format_reward": 0.4166666865348816,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3334.5,
      "epoch": 0.2805714285714286,
      "grad_norm": 0.23872096836566925,
      "kl": 0.06268310546875,
      "learning_rate": 1.0109617738307911e-07,
      "loss": 0.1248,
      "reward": -0.6380931735038757,
      "reward_std": 0.5134330447763205,
      "rewards/cosine_scaled_reward": -0.36071325466036797,
      "rewards/format_reward": 0.0833333358168602,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3078.5416870117188,
      "epoch": 0.28114285714285714,
      "grad_norm": 0.46638068556785583,
      "kl": 0.10198974609375,
      "learning_rate": 1.0088797220727779e-07,
      "loss": 0.0973,
      "reward": -0.29000576585531235,
      "reward_std": 0.3600935824215412,
      "rewards/cosine_scaled_reward": -0.2700028717517853,
      "rewards/format_reward": 0.2500000111758709,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2935.5833435058594,
      "epoch": 0.2817142857142857,
      "grad_norm": 0.18331332504749298,
      "kl": 0.08758544921875,
      "learning_rate": 1.0070165611810855e-07,
      "loss": 0.0326,
      "reward": 0.09402071312069893,
      "reward_std": 0.43435685709118843,
      "rewards/cosine_scaled_reward": -0.09882297366857529,
      "rewards/format_reward": 0.2916666679084301,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3129.5000610351562,
      "epoch": 0.2822857142857143,
      "grad_norm": 0.23339956998825073,
      "kl": 0.07867431640625,
      "learning_rate": 1.005372381963547e-07,
      "loss": 0.1134,
      "reward": 0.17649170011281967,
      "reward_std": 0.6835528574883938,
      "rewards/cosine_scaled_reward": -0.03675416484475136,
      "rewards/format_reward": 0.2500000074505806,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.28285714285714286,
      "grad_norm": 0.14486908912658691,
      "kl": 0.05389404296875,
      "learning_rate": 1.0039472645551372e-07,
      "loss": 0.0002,
      "reward": -0.5066226273775101,
      "reward_std": 0.25357529893517494,
      "rewards/cosine_scaled_reward": -0.27414464950561523,
      "rewards/format_reward": 0.0416666679084301,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2462.3750610351562,
      "epoch": 0.2834285714285714,
      "grad_norm": 0.4829932749271393,
      "kl": 0.08642578125,
      "learning_rate": 1.002741278414069e-07,
      "loss": 0.1718,
      "reward": 0.3855774737894535,
      "reward_std": 0.946885883808136,
      "rewards/cosine_scaled_reward": -0.07804461070918478,
      "rewards/format_reward": 0.5416666753590107,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3563.7916870117188,
      "epoch": 0.284,
      "grad_norm": 0.13713513314723969,
      "kl": 0.047088623046875,
      "learning_rate": 1.0017544823184055e-07,
      "loss": 0.0098,
      "reward": -0.4956064634025097,
      "reward_std": 0.2097947271540761,
      "rewards/cosine_scaled_reward": -0.26863656379282475,
      "rewards/format_reward": 0.0416666679084301,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2874.5833435058594,
      "epoch": 0.2845714285714286,
      "grad_norm": 28.739835739135742,
      "kl": 14.1053466796875,
      "learning_rate": 1.0009869243631952e-07,
      "loss": 0.0113,
      "reward": -0.19248086214065552,
      "reward_std": 0.41352982819080353,
      "rewards/cosine_scaled_reward": -0.22124045342206955,
      "rewards/format_reward": 0.25,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3546.5,
      "epoch": 0.28514285714285714,
      "grad_norm": 0.15727418661117554,
      "kl": 0.072509765625,
      "learning_rate": 1.000438641958131e-07,
      "loss": 0.0135,
      "reward": -0.3870402202010155,
      "reward_std": 0.4852425046265125,
      "rewards/cosine_scaled_reward": -0.2351867752149701,
      "rewards/format_reward": 0.0833333358168602,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3363.375,
      "epoch": 0.2857142857142857,
      "grad_norm": 0.14489901065826416,
      "kl": 0.0672607421875,
      "learning_rate": 1.0001096618257236e-07,
      "loss": 0.0835,
      "reward": -0.4100040141493082,
      "reward_std": 0.22312426194548607,
      "rewards/cosine_scaled_reward": -0.24666867777705193,
      "rewards/format_reward": 0.0833333358168602,
      "step": 500
    },
    {
      "epoch": 0.2857142857142857,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.04149833003999083,
      "train_runtime": 27564.67,
      "train_samples_per_second": 0.435,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}