{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2857142857142857,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 3068.5000610351562,
      "epoch": 0.0005714285714285715,
      "grad_norm": 0.08316484093666077,
      "kl": 0.0204010009765625,
      "learning_rate": 0.0,
      "loss": -0.0234,
      "reward": 0.200983926653862,
      "reward_std": 0.24425111338496208,
      "rewards/cosine_scaled_reward": -0.0453413650393486,
      "rewards/format_reward": 0.2916666679084301,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2821.9166717529297,
      "epoch": 0.001142857142857143,
      "grad_norm": 0.08570546656847,
      "kl": 0.011383056640625,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.0401,
      "reward": -0.22895785048604012,
      "reward_std": 0.31652447022497654,
      "rewards/cosine_scaled_reward": -0.3019789308309555,
      "rewards/format_reward": 0.375,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3095.1250610351562,
      "epoch": 0.0017142857142857142,
      "grad_norm": 0.12569886445999146,
      "kl": 0.0147705078125,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.09,
      "reward": 0.7377238147892058,
      "reward_std": 0.8798377588391304,
      "rewards/cosine_scaled_reward": 0.1396951973438263,
      "rewards/format_reward": 0.4583333507180214,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2732.041748046875,
      "epoch": 0.002285714285714286,
      "grad_norm": 0.158451646566391,
      "kl": 0.014007568359375,
      "learning_rate": 6e-06,
      "loss": 0.1068,
      "reward": 0.12815280258655548,
      "reward_std": 0.5621042996644974,
      "rewards/cosine_scaled_reward": -0.22759027034044266,
      "rewards/format_reward": 0.5833333432674408,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3108.0416870117188,
      "epoch": 0.002857142857142857,
      "grad_norm": 0.18162518739700317,
      "kl": 0.017303466796875,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.1695,
      "reward": -0.3482682505855337,
      "reward_std": 0.40073107928037643,
      "rewards/cosine_scaled_reward": -0.25746746733784676,
      "rewards/format_reward": 0.1666666716337204,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2933.7916717529297,
      "epoch": 0.0034285714285714284,
      "grad_norm": 0.21090327203273773,
      "kl": 0.0165863037109375,
      "learning_rate": 1e-05,
      "loss": 0.1763,
      "reward": -0.33019445836544037,
      "reward_std": 0.4870590269565582,
      "rewards/cosine_scaled_reward": -0.26926389336586,
      "rewards/format_reward": 0.2083333432674408,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2758.4584045410156,
      "epoch": 0.004,
      "grad_norm": 0.14720787107944489,
      "kl": 0.020599365234375,
      "learning_rate": 1.2e-05,
      "loss": 0.2279,
      "reward": -0.05960409715771675,
      "reward_std": 0.259865116328001,
      "rewards/cosine_scaled_reward": -0.19646872207522392,
      "rewards/format_reward": 0.3333333469927311,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2996.2916870117188,
      "epoch": 0.004571428571428572,
      "grad_norm": 0.11497102677822113,
      "kl": 0.0175323486328125,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 0.0293,
      "reward": -0.543822355568409,
      "reward_std": 0.21584158390760422,
      "rewards/cosine_scaled_reward": -0.4177445247769356,
      "rewards/format_reward": 0.2916666679084301,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2414.25,
      "epoch": 0.005142857142857143,
      "grad_norm": 0.0950588658452034,
      "kl": 0.0137939453125,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.0196,
      "reward": 0.346224058419466,
      "reward_std": 0.2081431858241558,
      "rewards/cosine_scaled_reward": -0.07688797824084759,
      "rewards/format_reward": 0.5,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3161.8333740234375,
      "epoch": 0.005714285714285714,
      "grad_norm": 0.06650526076555252,
      "kl": 0.011474609375,
      "learning_rate": 1.8e-05,
      "loss": -0.0217,
      "reward": 0.37098103761672974,
      "reward_std": 0.7834450677037239,
      "rewards/cosine_scaled_reward": -0.0020095184445381165,
      "rewards/format_reward": 0.3750000037252903,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2583.500045776367,
      "epoch": 0.006285714285714286,
      "grad_norm": 0.11317435652017593,
      "kl": 0.021820068359375,
      "learning_rate": 2e-05,
      "loss": 0.0819,
      "reward": -0.07037418521940708,
      "reward_std": 0.7221511900424957,
      "rewards/cosine_scaled_reward": -0.26435376331210136,
      "rewards/format_reward": 0.4583333395421505,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3155.3333740234375,
      "epoch": 0.006857142857142857,
      "grad_norm": 0.09496507048606873,
      "kl": 0.0161285400390625,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 0.0429,
      "reward": 0.3653342239558697,
      "reward_std": 0.8600304946303368,
      "rewards/cosine_scaled_reward": 0.016000449657440186,
      "rewards/format_reward": 0.3333333432674408,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3130.2083740234375,
      "epoch": 0.0074285714285714285,
      "grad_norm": 0.10498952120542526,
      "kl": 0.01226806640625,
      "learning_rate": 2.4e-05,
      "loss": 0.1331,
      "reward": 0.41605053562670946,
      "reward_std": 0.8384798839688301,
      "rewards/cosine_scaled_reward": -0.0003080591559410095,
      "rewards/format_reward": 0.4166666828095913,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2837.9166870117188,
      "epoch": 0.008,
      "grad_norm": 0.07714565843343735,
      "kl": 0.0148468017578125,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 0.037,
      "reward": 0.8426800966262817,
      "reward_std": 0.5940273888409138,
      "rewards/cosine_scaled_reward": 0.21300670504570007,
      "rewards/format_reward": 0.4166666716337204,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2945.0834045410156,
      "epoch": 0.008571428571428572,
      "grad_norm": 0.1212492287158966,
      "kl": 0.014190673828125,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.2023,
      "reward": -0.19765784591436386,
      "reward_std": 0.4926959238946438,
      "rewards/cosine_scaled_reward": -0.2654956020414829,
      "rewards/format_reward": 0.3333333469927311,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3162.2083740234375,
      "epoch": 0.009142857142857144,
      "grad_norm": 0.08731003850698471,
      "kl": 0.0145416259765625,
      "learning_rate": 3e-05,
      "loss": -0.0229,
      "reward": 0.32796957343816757,
      "reward_std": 0.6958065256476402,
      "rewards/cosine_scaled_reward": -0.044348541647195816,
      "rewards/format_reward": 0.4166666716337204,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.009714285714285713,
      "grad_norm": 0.07292164862155914,
      "kl": 0.0186614990234375,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 0.0007,
      "reward": -0.43665362149477005,
      "reward_std": 0.42871900647878647,
      "rewards/cosine_scaled_reward": -0.2391601405106485,
      "rewards/format_reward": 0.0416666679084301,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3454.8333740234375,
      "epoch": 0.010285714285714285,
      "grad_norm": 0.06652959436178207,
      "kl": 0.0173797607421875,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.0322,
      "reward": -0.15206466615200043,
      "reward_std": 0.8579627200961113,
      "rewards/cosine_scaled_reward": -0.1801990047097206,
      "rewards/format_reward": 0.2083333358168602,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3139.5,
      "epoch": 0.010857142857142857,
      "grad_norm": 0.07377547770738602,
      "kl": 0.021087646484375,
      "learning_rate": 3.6e-05,
      "loss": -0.0956,
      "reward": 0.27641918882727623,
      "reward_std": 0.3846270814538002,
      "rewards/cosine_scaled_reward": 0.013209596276283264,
      "rewards/format_reward": 0.2500000111758709,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2240.791732788086,
      "epoch": 0.011428571428571429,
      "grad_norm": 0.11604765802621841,
      "kl": 0.01763916015625,
      "learning_rate": 3.8e-05,
      "loss": -0.0082,
      "reward": 0.5373580157756805,
      "reward_std": 0.6646340787410736,
      "rewards/cosine_scaled_reward": -0.04382099770009518,
      "rewards/format_reward": 0.625,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1933.7500610351562,
      "epoch": 0.012,
      "grad_norm": 0.17429925501346588,
      "kl": 0.0303955078125,
      "learning_rate": 4e-05,
      "loss": 0.2362,
      "reward": 0.5577291771769524,
      "reward_std": 0.8789636418223381,
      "rewards/cosine_scaled_reward": -0.07530209049582481,
      "rewards/format_reward": 0.7083333358168602,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3554.666748046875,
      "epoch": 0.012571428571428572,
      "grad_norm": 0.05839058756828308,
      "kl": 0.0146636962890625,
      "learning_rate": 4.2e-05,
      "loss": 0.0079,
      "reward": 0.32253583520650864,
      "reward_std": 1.1349023096263409,
      "rewards/cosine_scaled_reward": 0.01543455570936203,
      "rewards/format_reward": 0.2916666753590107,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2841.666717529297,
      "epoch": 0.013142857142857144,
      "grad_norm": 0.1242748275399208,
      "kl": 0.01934814453125,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.0414,
      "reward": 0.6808896251022816,
      "reward_std": 0.757483784109354,
      "rewards/cosine_scaled_reward": 0.09044479578733444,
      "rewards/format_reward": 0.5000000074505806,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2709.6666870117188,
      "epoch": 0.013714285714285714,
      "grad_norm": 0.08774807304143906,
      "kl": 0.018585205078125,
      "learning_rate": 4.600000000000001e-05,
      "loss": 0.0703,
      "reward": 0.7750062793493271,
      "reward_std": 0.5958304777741432,
      "rewards/cosine_scaled_reward": 0.11666978895664215,
      "rewards/format_reward": 0.5416666679084301,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2390.125030517578,
      "epoch": 0.014285714285714285,
      "grad_norm": 0.08277406543493271,
      "kl": 0.015777587890625,
      "learning_rate": 4.8e-05,
      "loss": 0.0432,
      "reward": 1.0784958824515343,
      "reward_std": 0.4524005614221096,
      "rewards/cosine_scaled_reward": 0.18508130311965942,
      "rewards/format_reward": 0.7083333432674408,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3135.791717529297,
      "epoch": 0.014857142857142857,
      "grad_norm": 0.0981830507516861,
      "kl": 0.0186614990234375,
      "learning_rate": 5e-05,
      "loss": -0.0303,
      "reward": -0.14081082493066788,
      "reward_std": 0.684042863547802,
      "rewards/cosine_scaled_reward": -0.23707208782434464,
      "rewards/format_reward": 0.3333333358168602,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3550.1666870117188,
      "epoch": 0.015428571428571429,
      "grad_norm": 0.08542604744434357,
      "kl": 0.0210418701171875,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 0.0193,
      "reward": -0.24810760095715523,
      "reward_std": 0.5632593892514706,
      "rewards/cosine_scaled_reward": -0.16572047024965286,
      "rewards/format_reward": 0.0833333358168602,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3004.2500610351562,
      "epoch": 0.016,
      "grad_norm": 0.1371905505657196,
      "kl": 0.024505615234375,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 0.1257,
      "reward": 0.41203732788562775,
      "reward_std": 0.9210044294595718,
      "rewards/cosine_scaled_reward": -0.0023146718740463257,
      "rewards/format_reward": 0.416666679084301,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2946.9583740234375,
      "epoch": 0.01657142857142857,
      "grad_norm": 0.08096691220998764,
      "kl": 0.023193359375,
      "learning_rate": 5.6000000000000006e-05,
      "loss": -0.026,
      "reward": 0.13879438489675522,
      "reward_std": 0.6071850284934044,
      "rewards/cosine_scaled_reward": -0.07643614336848259,
      "rewards/format_reward": 0.2916666679084301,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3571.5833740234375,
      "epoch": 0.017142857142857144,
      "grad_norm": 0.06950397789478302,
      "kl": 0.0203857421875,
      "learning_rate": 5.8e-05,
      "loss": 0.008,
      "reward": -0.10841021686792374,
      "reward_std": 1.061239955946803,
      "rewards/cosine_scaled_reward": -0.13753844052553177,
      "rewards/format_reward": 0.1666666716337204,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2661.9583740234375,
      "epoch": 0.017714285714285714,
      "grad_norm": 0.13690024614334106,
      "kl": 0.021331787109375,
      "learning_rate": 6e-05,
      "loss": 0.069,
      "reward": 0.4652084708213806,
      "reward_std": 0.5595494862645864,
      "rewards/cosine_scaled_reward": 0.024270888417959213,
      "rewards/format_reward": 0.4166666716337204,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2511.9583740234375,
      "epoch": 0.018285714285714287,
      "grad_norm": 0.07431714236736298,
      "kl": 0.020538330078125,
      "learning_rate": 6.2e-05,
      "loss": 0.0788,
      "reward": 0.8832313418388367,
      "reward_std": 0.708160936832428,
      "rewards/cosine_scaled_reward": 0.14994902536273003,
      "rewards/format_reward": 0.5833333432674408,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.018857142857142857,
      "grad_norm": 0.08026785403490067,
      "kl": 0.02886962890625,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.0012,
      "reward": -0.6131787896156311,
      "reward_std": 0.201310433447361,
      "rewards/cosine_scaled_reward": -0.30658938735723495,
      "rewards/format_reward": 0.0,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2966.3333435058594,
      "epoch": 0.019428571428571427,
      "grad_norm": 0.10494574159383774,
      "kl": 0.024169921875,
      "learning_rate": 6.6e-05,
      "loss": -0.0606,
      "reward": -0.22858721017837524,
      "reward_std": 0.3569427113980055,
      "rewards/cosine_scaled_reward": -0.2601269483566284,
      "rewards/format_reward": 0.2916666679084301,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.02,
      "grad_norm": 0.07825391739606857,
      "kl": 0.027740478515625,
      "learning_rate": 6.800000000000001e-05,
      "loss": 0.0011,
      "reward": -0.7152878791093826,
      "reward_std": 0.29954793583601713,
      "rewards/cosine_scaled_reward": -0.3576439470052719,
      "rewards/format_reward": 0.0,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.02057142857142857,
      "grad_norm": 0.07421068102121353,
      "kl": 0.029144287109375,
      "learning_rate": 7e-05,
      "loss": 0.0012,
      "reward": -0.4027569368481636,
      "reward_std": 0.21831603534519672,
      "rewards/cosine_scaled_reward": -0.20137847773730755,
      "rewards/format_reward": 0.0,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3572.5416870117188,
      "epoch": 0.021142857142857144,
      "grad_norm": 0.057787228375673294,
      "kl": 0.022430419921875,
      "learning_rate": 7.2e-05,
      "loss": 0.0044,
      "reward": -0.1326567530632019,
      "reward_std": 0.5722145922482014,
      "rewards/cosine_scaled_reward": -0.1288283858448267,
      "rewards/format_reward": 0.125,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3340.375,
      "epoch": 0.021714285714285714,
      "grad_norm": 0.06777527928352356,
      "kl": 0.027099609375,
      "learning_rate": 7.4e-05,
      "loss": -0.021,
      "reward": -0.35966064035892487,
      "reward_std": 0.3422342501580715,
      "rewards/cosine_scaled_reward": -0.30483032763004303,
      "rewards/format_reward": 0.25,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2416.291717529297,
      "epoch": 0.022285714285714287,
      "grad_norm": 0.10955022275447845,
      "kl": 0.0400390625,
      "learning_rate": 7.6e-05,
      "loss": 0.0797,
      "reward": 0.43888016045093536,
      "reward_std": 0.6578193679451942,
      "rewards/cosine_scaled_reward": -0.05139327887445688,
      "rewards/format_reward": 0.541666679084301,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3547.3333740234375,
      "epoch": 0.022857142857142857,
      "grad_norm": 0.07015793770551682,
      "kl": 0.0251922607421875,
      "learning_rate": 7.800000000000001e-05,
      "loss": 0.0054,
      "reward": -0.40920185297727585,
      "reward_std": 0.4526283470913768,
      "rewards/cosine_scaled_reward": -0.2671009246259928,
      "rewards/format_reward": 0.125,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2995.75,
      "epoch": 0.023428571428571427,
      "grad_norm": 0.07032545655965805,
      "kl": 0.0225830078125,
      "learning_rate": 8e-05,
      "loss": -0.0094,
      "reward": -0.155843585729599,
      "reward_std": 0.10809195134788752,
      "rewards/cosine_scaled_reward": -0.2029217779636383,
      "rewards/format_reward": 0.25,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2693.4166717529297,
      "epoch": 0.024,
      "grad_norm": 0.06467006355524063,
      "kl": 0.0238800048828125,
      "learning_rate": 8.2e-05,
      "loss": 0.0134,
      "reward": 0.43927521631121635,
      "reward_std": 0.6234664730727673,
      "rewards/cosine_scaled_reward": -0.009529059752821922,
      "rewards/format_reward": 0.4583333395421505,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.02457142857142857,
      "grad_norm": 0.06509877741336823,
      "kl": 0.02996826171875,
      "learning_rate": 8.4e-05,
      "loss": 0.0012,
      "reward": -0.6067558601498604,
      "reward_std": 0.1665392592549324,
      "rewards/cosine_scaled_reward": -0.3033779449760914,
      "rewards/format_reward": 0.0,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3574.125,
      "epoch": 0.025142857142857144,
      "grad_norm": 0.06305427849292755,
      "kl": 0.027435302734375,
      "learning_rate": 8.6e-05,
      "loss": 0.004,
      "reward": -0.23581353574991226,
      "reward_std": 0.6179232448339462,
      "rewards/cosine_scaled_reward": -0.18040677905082703,
      "rewards/format_reward": 0.125,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3155.125,
      "epoch": 0.025714285714285714,
      "grad_norm": 0.10693054646253586,
      "kl": 0.0302734375,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.1501,
      "reward": -0.355922631919384,
      "reward_std": 0.56600271910429,
      "rewards/cosine_scaled_reward": -0.2821279801428318,
      "rewards/format_reward": 0.2083333395421505,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3348.875,
      "epoch": 0.026285714285714287,
      "grad_norm": 0.08973235636949539,
      "kl": 0.027587890625,
      "learning_rate": 9e-05,
      "loss": 0.087,
      "reward": -0.17564061796292663,
      "reward_std": 0.6632250510156155,
      "rewards/cosine_scaled_reward": -0.19198699295520782,
      "rewards/format_reward": 0.2083333358168602,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2860.0833740234375,
      "epoch": 0.026857142857142857,
      "grad_norm": 0.0798032283782959,
      "kl": 0.023529052734375,
      "learning_rate": 9.200000000000001e-05,
      "loss": 0.0161,
      "reward": 0.2136048525571823,
      "reward_std": 0.788389652967453,
      "rewards/cosine_scaled_reward": -0.05986428260803223,
      "rewards/format_reward": 0.3333333358168602,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2709.3333740234375,
      "epoch": 0.027428571428571427,
      "grad_norm": 0.10712946206331253,
      "kl": 0.0269775390625,
      "learning_rate": 9.4e-05,
      "loss": 0.2225,
      "reward": 0.7221578508615494,
      "reward_std": 0.7804624438285828,
      "rewards/cosine_scaled_reward": 0.04857892170548439,
      "rewards/format_reward": 0.6250000149011612,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2863.666717529297,
      "epoch": 0.028,
      "grad_norm": 0.10389462858438492,
      "kl": 0.03228759765625,
      "learning_rate": 9.6e-05,
      "loss": 0.1357,
      "reward": 0.5737984776496887,
      "reward_std": 0.6438678838312626,
      "rewards/cosine_scaled_reward": 0.07856592535972595,
      "rewards/format_reward": 0.4166666865348816,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3154.916748046875,
      "epoch": 0.02857142857142857,
      "grad_norm": 0.1013183668255806,
      "kl": 0.0369873046875,
      "learning_rate": 9.8e-05,
      "loss": 0.1183,
      "reward": 0.13152291253209114,
      "reward_std": 0.5646936669945717,
      "rewards/cosine_scaled_reward": -0.10090522468090057,
      "rewards/format_reward": 0.3333333432674408,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3158.9583435058594,
      "epoch": 0.029142857142857144,
      "grad_norm": 0.07180243730545044,
      "kl": 0.033721923828125,
      "learning_rate": 0.0001,
      "loss": -0.0368,
      "reward": 0.07173049449920654,
      "reward_std": 0.21982344426214695,
      "rewards/cosine_scaled_reward": -0.08913473784923553,
      "rewards/format_reward": 0.25,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3562.7083740234375,
      "epoch": 0.029714285714285714,
      "grad_norm": 0.06174299493432045,
      "kl": 0.02374267578125,
      "learning_rate": 9.999890338174276e-05,
      "loss": 0.0131,
      "reward": -0.344110494479537,
      "reward_std": 0.8078071549534798,
      "rewards/cosine_scaled_reward": -0.23455525189638138,
      "rewards/format_reward": 0.1250000037252903,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3081.625030517578,
      "epoch": 0.030285714285714287,
      "grad_norm": 0.073664590716362,
      "kl": 0.0404052734375,
      "learning_rate": 9.999561358041869e-05,
      "loss": 0.0109,
      "reward": 0.42618853598833084,
      "reward_std": 0.5039119943976402,
      "rewards/cosine_scaled_reward": 0.06726095359772444,
      "rewards/format_reward": 0.2916666679084301,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3388.2083740234375,
      "epoch": 0.030857142857142857,
      "grad_norm": 0.0838247761130333,
      "kl": 0.04852294921875,
      "learning_rate": 9.999013075636805e-05,
      "loss": 0.0601,
      "reward": -0.505183070898056,
      "reward_std": 0.41321277990937233,
      "rewards/cosine_scaled_reward": -0.3359248712658882,
      "rewards/format_reward": 0.1666666679084301,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3401.75,
      "epoch": 0.03142857142857143,
      "grad_norm": 0.07153363525867462,
      "kl": 0.048828125,
      "learning_rate": 9.998245517681595e-05,
      "loss": 0.0337,
      "reward": 0.6791047602891922,
      "reward_std": 0.6441808789968491,
      "rewards/cosine_scaled_reward": 0.1312190592288971,
      "rewards/format_reward": 0.4166666716337204,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3383.5416870117188,
      "epoch": 0.032,
      "grad_norm": 0.2566574811935425,
      "kl": 0.1033935546875,
      "learning_rate": 9.997258721585931e-05,
      "loss": 0.1083,
      "reward": -0.4387590363621712,
      "reward_std": 0.3598366603255272,
      "rewards/cosine_scaled_reward": -0.2610462047159672,
      "rewards/format_reward": 0.0833333358168602,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3268.0416870117188,
      "epoch": 0.03257142857142857,
      "grad_norm": 0.11019923537969589,
      "kl": 0.04718017578125,
      "learning_rate": 9.996052735444863e-05,
      "loss": 0.0445,
      "reward": 0.07900557294487953,
      "reward_std": 0.7729422375559807,
      "rewards/cosine_scaled_reward": -0.08549723774194717,
      "rewards/format_reward": 0.2500000111758709,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3465.5,
      "epoch": 0.03314285714285714,
      "grad_norm": 0.08362831175327301,
      "kl": 0.05291748046875,
      "learning_rate": 9.994627618036454e-05,
      "loss": 0.0591,
      "reward": -0.44299469888210297,
      "reward_std": 0.6330600045621395,
      "rewards/cosine_scaled_reward": -0.28399735875427723,
      "rewards/format_reward": 0.1250000037252903,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3503.9583740234375,
      "epoch": 0.03371428571428572,
      "grad_norm": 0.08290518820285797,
      "kl": 0.0670166015625,
      "learning_rate": 9.992983438818914e-05,
      "loss": 0.01,
      "reward": -0.33126915991306305,
      "reward_std": 0.5302771776914597,
      "rewards/cosine_scaled_reward": -0.22813457623124123,
      "rewards/format_reward": 0.125,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.03428571428571429,
      "grad_norm": 0.06376607716083527,
      "kl": 0.055938720703125,
      "learning_rate": 9.991120277927223e-05,
      "loss": 0.0022,
      "reward": -0.5242063365876675,
      "reward_std": 0.24296396784484386,
      "rewards/cosine_scaled_reward": -0.2829365022480488,
      "rewards/format_reward": 0.0416666679084301,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2896.500030517578,
      "epoch": 0.03485714285714286,
      "grad_norm": 0.10472023487091064,
      "kl": 0.0540771484375,
      "learning_rate": 9.989038226169209e-05,
      "loss": 0.0943,
      "reward": 0.12865129858255386,
      "reward_std": 1.051661066710949,
      "rewards/cosine_scaled_reward": -0.16484103631228209,
      "rewards/format_reward": 0.4583333395421505,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2667.5416717529297,
      "epoch": 0.03542857142857143,
      "grad_norm": 17.001340866088867,
      "kl": 1.73077392578125,
      "learning_rate": 9.986737385021142e-05,
      "loss": 0.1479,
      "reward": -0.36982037127017975,
      "reward_std": 0.38738980889320374,
      "rewards/cosine_scaled_reward": -0.3515768498182297,
      "rewards/format_reward": 0.3333333358168602,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3583.3333740234375,
      "epoch": 0.036,
      "grad_norm": 0.07674090564250946,
      "kl": 0.0684814453125,
      "learning_rate": 9.98421786662277e-05,
      "loss": 0.0031,
      "reward": 0.01588384434580803,
      "reward_std": 0.9027550332248211,
      "rewards/cosine_scaled_reward": -0.07539140060544014,
      "rewards/format_reward": 0.1666666716337204,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3046.6666870117188,
      "epoch": 0.036571428571428574,
      "grad_norm": 0.25831571221351624,
      "kl": 0.06427001953125,
      "learning_rate": 9.981479793771866e-05,
      "loss": 0.212,
      "reward": -0.42657897621393204,
      "reward_std": 0.26655818335711956,
      "rewards/cosine_scaled_reward": -0.3174561560153961,
      "rewards/format_reward": 0.2083333358168602,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3522.0416870117188,
      "epoch": 0.037142857142857144,
      "grad_norm": 0.0953318402171135,
      "kl": 0.1075439453125,
      "learning_rate": 9.97852329991824e-05,
      "loss": 0.0295,
      "reward": -0.37634188309311867,
      "reward_std": 0.3436935096979141,
      "rewards/cosine_scaled_reward": -0.22983760759234428,
      "rewards/format_reward": 0.0833333358168602,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3214.0000610351562,
      "epoch": 0.037714285714285714,
      "grad_norm": 0.2412433624267578,
      "kl": 0.08349609375,
      "learning_rate": 9.97534852915723e-05,
      "loss": 0.0985,
      "reward": 0.3821183741092682,
      "reward_std": 1.0826219320297241,
      "rewards/cosine_scaled_reward": -0.038107482716441154,
      "rewards/format_reward": 0.4583333432674408,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3045.5000610351562,
      "epoch": 0.038285714285714284,
      "grad_norm": 0.11741996556520462,
      "kl": 0.1163330078125,
      "learning_rate": 9.971955636222684e-05,
      "loss": 0.0296,
      "reward": 0.2733103707432747,
      "reward_std": 1.0267937406897545,
      "rewards/cosine_scaled_reward": -0.05084482580423355,
      "rewards/format_reward": 0.3750000037252903,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.038857142857142854,
      "grad_norm": 0.1156265065073967,
      "kl": 0.1378173828125,
      "learning_rate": 9.968344786479416e-05,
      "loss": 0.0055,
      "reward": -0.5511204618960619,
      "reward_std": 0.14900160022079945,
      "rewards/cosine_scaled_reward": -0.27556023094803095,
      "rewards/format_reward": 0.0,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3256.9166870117188,
      "epoch": 0.03942857142857143,
      "grad_norm": 0.14248953759670258,
      "kl": 0.20166015625,
      "learning_rate": 9.964516155915151e-05,
      "loss": 0.0011,
      "reward": -0.09149269759654999,
      "reward_std": 0.4294120315462351,
      "rewards/cosine_scaled_reward": -0.1707463413476944,
      "rewards/format_reward": 0.25,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3430.25,
      "epoch": 0.04,
      "grad_norm": 0.26481300592422485,
      "kl": 0.226318359375,
      "learning_rate": 9.960469931131939e-05,
      "loss": 0.061,
      "reward": -0.2762333448044956,
      "reward_std": 0.6956104636192322,
      "rewards/cosine_scaled_reward": -0.24228334799408913,
      "rewards/format_reward": 0.2083333358168602,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2133.9583435058594,
      "epoch": 0.04057142857142857,
      "grad_norm": 0.55636066198349,
      "kl": 0.196533203125,
      "learning_rate": 9.956206309337068e-05,
      "loss": 0.1912,
      "reward": 0.6508506219834089,
      "reward_std": 0.7065669223666191,
      "rewards/cosine_scaled_reward": 0.012925267219543457,
      "rewards/format_reward": 0.6250000149011612,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3280.0000610351562,
      "epoch": 0.04114285714285714,
      "grad_norm": 0.2733776569366455,
      "kl": 0.326416015625,
      "learning_rate": 9.951725498333448e-05,
      "loss": 0.089,
      "reward": 0.04937553819036111,
      "reward_std": 0.650901660323143,
      "rewards/cosine_scaled_reward": -0.10031222924590111,
      "rewards/format_reward": 0.2500000074505806,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3007.9166870117188,
      "epoch": 0.04171428571428572,
      "grad_norm": 0.3959416151046753,
      "kl": 0.407958984375,
      "learning_rate": 9.947027716509488e-05,
      "loss": 0.1308,
      "reward": 0.21741341799497604,
      "reward_std": 0.7383521795272827,
      "rewards/cosine_scaled_reward": -0.16212662309408188,
      "rewards/format_reward": 0.5416666865348816,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3514.2083740234375,
      "epoch": 0.04228571428571429,
      "grad_norm": 0.2642626464366913,
      "kl": 0.671875,
      "learning_rate": 9.942113192828445e-05,
      "loss": 0.0673,
      "reward": -0.7055738568305969,
      "reward_std": 0.5232572704553604,
      "rewards/cosine_scaled_reward": -0.39445359259843826,
      "rewards/format_reward": 0.0833333358168602,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2784.0000610351562,
      "epoch": 0.04285714285714286,
      "grad_norm": 0.5180374383926392,
      "kl": 0.708984375,
      "learning_rate": 9.936982166817273e-05,
      "loss": 0.1735,
      "reward": 0.5747006963938475,
      "reward_std": 0.956249326467514,
      "rewards/cosine_scaled_reward": -0.06681633368134499,
      "rewards/format_reward": 0.7083333507180214,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3248.041748046875,
      "epoch": 0.04342857142857143,
      "grad_norm": 0.46085768938064575,
      "kl": 0.947265625,
      "learning_rate": 9.931634888554937e-05,
      "loss": 0.1319,
      "reward": 0.21069841086864471,
      "reward_std": 0.9020620584487915,
      "rewards/cosine_scaled_reward": -0.2071508066728711,
      "rewards/format_reward": 0.6250000149011612,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2514.75,
      "epoch": 0.044,
      "grad_norm": 0.5324290990829468,
      "kl": 1.1328125,
      "learning_rate": 9.926071618660238e-05,
      "loss": 0.0618,
      "reward": 0.7419831641018391,
      "reward_std": 0.5870551839470863,
      "rewards/cosine_scaled_reward": -0.08734174817800522,
      "rewards/format_reward": 0.9166666716337204,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2806.041748046875,
      "epoch": 0.044571428571428574,
      "grad_norm": 0.5151541233062744,
      "kl": 0.9423828125,
      "learning_rate": 9.920292628279099e-05,
      "loss": 0.2011,
      "reward": 1.1562666706740856,
      "reward_std": 0.9286830723285675,
      "rewards/cosine_scaled_reward": 0.14063331112265587,
      "rewards/format_reward": 0.8750000149011612,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2392.416748046875,
      "epoch": 0.045142857142857144,
      "grad_norm": 0.38864853978157043,
      "kl": 0.93310546875,
      "learning_rate": 9.914298199071362e-05,
      "loss": 0.0511,
      "reward": 0.5092507172375917,
      "reward_std": 0.6603717654943466,
      "rewards/cosine_scaled_reward": -0.16204129718244076,
      "rewards/format_reward": 0.8333333432674408,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2799.0001220703125,
      "epoch": 0.045714285714285714,
      "grad_norm": 0.33745047450065613,
      "kl": 1.154296875,
      "learning_rate": 9.908088623197048e-05,
      "loss": 0.1739,
      "reward": 0.49583832919597626,
      "reward_std": 0.5688673853874207,
      "rewards/cosine_scaled_reward": -0.21041417494416237,
      "rewards/format_reward": 0.9166666865348816,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2742.9584350585938,
      "epoch": 0.046285714285714284,
      "grad_norm": 0.6719815135002136,
      "kl": 1.2353515625,
      "learning_rate": 9.901664203302126e-05,
      "loss": 0.0793,
      "reward": 0.8690863847732544,
      "reward_std": 0.67031354829669,
      "rewards/cosine_scaled_reward": -0.04462350904941559,
      "rewards/format_reward": 0.9583333432674408,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2788.166748046875,
      "epoch": 0.046857142857142854,
      "grad_norm": 0.5158824920654297,
      "kl": 0.869140625,
      "learning_rate": 9.895025252503756e-05,
      "loss": 0.0128,
      "reward": 0.9047548621892929,
      "reward_std": 0.6626572608947754,
      "rewards/cosine_scaled_reward": -0.04762259125709534,
      "rewards/format_reward": 1.0,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1382.4583587646484,
      "epoch": 0.04742857142857143,
      "grad_norm": 0.27055126428604126,
      "kl": 0.299072265625,
      "learning_rate": 9.888172094375034e-05,
      "loss": 0.0848,
      "reward": 1.0674683526158333,
      "reward_std": 0.4278734102845192,
      "rewards/cosine_scaled_reward": 0.03373415768146515,
      "rewards/format_reward": 1.0,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2475.666717529297,
      "epoch": 0.048,
      "grad_norm": 0.20611289143562317,
      "kl": 0.50341796875,
      "learning_rate": 9.881105062929221e-05,
      "loss": 0.0433,
      "reward": 1.037320300936699,
      "reward_std": 1.0173222571611404,
      "rewards/cosine_scaled_reward": 0.08116012637037784,
      "rewards/format_reward": 0.8750000149011612,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2356.3333740234375,
      "epoch": 0.04857142857142857,
      "grad_norm": 0.25637194514274597,
      "kl": 0.556884765625,
      "learning_rate": 9.87382450260346e-05,
      "loss": 0.0248,
      "reward": 1.1020738258957863,
      "reward_std": 0.6897303387522697,
      "rewards/cosine_scaled_reward": 0.05103694926947355,
      "rewards/format_reward": 1.0,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3392.2501220703125,
      "epoch": 0.04914285714285714,
      "grad_norm": 0.18801869451999664,
      "kl": 0.587890625,
      "learning_rate": 9.866330768241984e-05,
      "loss": 0.0602,
      "reward": 0.8699261844158173,
      "reward_std": 1.160498969256878,
      "rewards/cosine_scaled_reward": 0.08079641312360764,
      "rewards/format_reward": 0.7083333507180214,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3220.041748046875,
      "epoch": 0.04971428571428571,
      "grad_norm": 0.238076850771904,
      "kl": 0.57275390625,
      "learning_rate": 9.858624225078841e-05,
      "loss": 0.0654,
      "reward": 0.5753591060638428,
      "reward_std": 0.5094204191118479,
      "rewards/cosine_scaled_reward": -0.045653849840164185,
      "rewards/format_reward": 0.6666666865348816,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2765.9583740234375,
      "epoch": 0.05028571428571429,
      "grad_norm": 0.1997767835855484,
      "kl": 0.48583984375,
      "learning_rate": 9.850705248720069e-05,
      "loss": 0.0971,
      "reward": 0.47976822033524513,
      "reward_std": 0.6232990622520447,
      "rewards/cosine_scaled_reward": -0.17678256519138813,
      "rewards/format_reward": 0.8333333358168602,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3000.875030517578,
      "epoch": 0.05085714285714286,
      "grad_norm": 0.189803808927536,
      "kl": 0.403564453125,
      "learning_rate": 9.842574225125401e-05,
      "loss": 0.017,
      "reward": 0.1426578164100647,
      "reward_std": 0.8176284991204739,
      "rewards/cosine_scaled_reward": -0.13700442761182785,
      "rewards/format_reward": 0.4166666716337204,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2594.2084350585938,
      "epoch": 0.05142857142857143,
      "grad_norm": 0.44425904750823975,
      "kl": 0.220947265625,
      "learning_rate": 9.834231550589462e-05,
      "loss": 0.0654,
      "reward": 0.7554673850536346,
      "reward_std": 1.3941691219806671,
      "rewards/cosine_scaled_reward": 0.04440037161111832,
      "rewards/format_reward": 0.666666679084301,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3254.916748046875,
      "epoch": 0.052,
      "grad_norm": 0.19816897809505463,
      "kl": 0.38525390625,
      "learning_rate": 9.825677631722435e-05,
      "loss": 0.0586,
      "reward": 0.5828766226768494,
      "reward_std": 0.8098243772983551,
      "rewards/cosine_scaled_reward": -0.06272834818810225,
      "rewards/format_reward": 0.7083333507180214,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2826.291717529297,
      "epoch": 0.052571428571428575,
      "grad_norm": 0.21156969666481018,
      "kl": 0.36474609375,
      "learning_rate": 9.816912885430258e-05,
      "loss": 0.0932,
      "reward": 0.22245215624570847,
      "reward_std": 0.7063884437084198,
      "rewards/cosine_scaled_reward": -0.20127389580011368,
      "rewards/format_reward": 0.625,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2395.6666870117188,
      "epoch": 0.053142857142857144,
      "grad_norm": 0.2171396166086197,
      "kl": 0.26220703125,
      "learning_rate": 9.807937738894303e-05,
      "loss": 0.0753,
      "reward": 0.8515213429927826,
      "reward_std": 0.9921551188454032,
      "rewards/cosine_scaled_reward": 0.11326067708432674,
      "rewards/format_reward": 0.625,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3411.2500610351562,
      "epoch": 0.053714285714285714,
      "grad_norm": 0.27403607964515686,
      "kl": 0.39208984375,
      "learning_rate": 9.798752629550546e-05,
      "loss": 0.0915,
      "reward": -0.09905853308737278,
      "reward_std": 0.7048804685473442,
      "rewards/cosine_scaled_reward": -0.2578625986352563,
      "rewards/format_reward": 0.416666679084301,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3139.2501220703125,
      "epoch": 0.054285714285714284,
      "grad_norm": 0.2699209153652191,
      "kl": 0.39892578125,
      "learning_rate": 9.789358005068262e-05,
      "loss": -0.0192,
      "reward": 0.33204662054777145,
      "reward_std": 0.8725630715489388,
      "rewards/cosine_scaled_reward": -0.16731002740561962,
      "rewards/format_reward": 0.666666679084301,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3406.0833740234375,
      "epoch": 0.054857142857142854,
      "grad_norm": 0.2241956889629364,
      "kl": 0.4150390625,
      "learning_rate": 9.779754323328192e-05,
      "loss": 0.0288,
      "reward": 0.29199653305113316,
      "reward_std": 0.6021534074097872,
      "rewards/cosine_scaled_reward": -0.10400174837559462,
      "rewards/format_reward": 0.5,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2894.8333435058594,
      "epoch": 0.05542857142857143,
      "grad_norm": 0.20786413550376892,
      "kl": 0.424560546875,
      "learning_rate": 9.769942052400235e-05,
      "loss": 0.0021,
      "reward": 0.17632445320487022,
      "reward_std": 0.2286351751536131,
      "rewards/cosine_scaled_reward": -0.16183778084814548,
      "rewards/format_reward": 0.5,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2208.1666870117188,
      "epoch": 0.056,
      "grad_norm": 0.20279790461063385,
      "kl": 0.3033447265625,
      "learning_rate": 9.759921670520634e-05,
      "loss": 0.0457,
      "reward": 1.092903109267354,
      "reward_std": 0.9870968163013458,
      "rewards/cosine_scaled_reward": 0.08811822533607483,
      "rewards/format_reward": 0.9166666716337204,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2346.5833587646484,
      "epoch": 0.05657142857142857,
      "grad_norm": 0.3798772394657135,
      "kl": 0.329345703125,
      "learning_rate": 9.749693666068664e-05,
      "loss": 0.1025,
      "reward": 0.7225791215896606,
      "reward_std": 0.9258620589971542,
      "rewards/cosine_scaled_reward": -0.03454381600022316,
      "rewards/format_reward": 0.791666679084301,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1839.2500610351562,
      "epoch": 0.05714285714285714,
      "grad_norm": 0.2397909164428711,
      "kl": 0.238525390625,
      "learning_rate": 9.739258537542835e-05,
      "loss": 0.0492,
      "reward": 1.0250511392951012,
      "reward_std": 0.7990377843379974,
      "rewards/cosine_scaled_reward": 0.012525551952421665,
      "rewards/format_reward": 1.0,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3216.58349609375,
      "epoch": 0.05771428571428571,
      "grad_norm": 0.22909773886203766,
      "kl": 0.4794921875,
      "learning_rate": 9.728616793536588e-05,
      "loss": 0.0818,
      "reward": 0.23036185838282108,
      "reward_std": 0.5441985353827477,
      "rewards/cosine_scaled_reward": -0.23898574337363243,
      "rewards/format_reward": 0.7083333358168602,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3036.4583740234375,
      "epoch": 0.05828571428571429,
      "grad_norm": 0.22612375020980835,
      "kl": 0.576416015625,
      "learning_rate": 9.717768952713513e-05,
      "loss": 0.0668,
      "reward": 0.42826264537870884,
      "reward_std": 0.4419846907258034,
      "rewards/cosine_scaled_reward": -0.18170202150940895,
      "rewards/format_reward": 0.7916666716337204,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3196.041748046875,
      "epoch": 0.05885714285714286,
      "grad_norm": 0.32880789041519165,
      "kl": 0.55517578125,
      "learning_rate": 9.706715543782064e-05,
      "loss": 0.0338,
      "reward": 0.6674360632896423,
      "reward_std": 0.6189832799136639,
      "rewards/cosine_scaled_reward": -0.12461531162261963,
      "rewards/format_reward": 0.9166666865348816,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2930.5833740234375,
      "epoch": 0.05942857142857143,
      "grad_norm": 0.2633622884750366,
      "kl": 0.53759765625,
      "learning_rate": 9.695457105469806e-05,
      "loss": 0.066,
      "reward": 0.7318950295448303,
      "reward_std": 0.4617820382118225,
      "rewards/cosine_scaled_reward": -0.13405249640345573,
      "rewards/format_reward": 1.0,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2902.7084350585938,
      "epoch": 0.06,
      "grad_norm": 0.4247024953365326,
      "kl": 0.486328125,
      "learning_rate": 9.683994186497132e-05,
      "loss": 0.0147,
      "reward": 0.2280621938407421,
      "reward_std": 0.4256502091884613,
      "rewards/cosine_scaled_reward": -0.2401355840265751,
      "rewards/format_reward": 0.7083333544433117,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2954.041717529297,
      "epoch": 0.060571428571428575,
      "grad_norm": 0.350850909948349,
      "kl": 0.587646484375,
      "learning_rate": 9.672327345550543e-05,
      "loss": 0.0095,
      "reward": 0.3893072069622576,
      "reward_std": 0.7964539304375648,
      "rewards/cosine_scaled_reward": -0.09701308235526085,
      "rewards/format_reward": 0.5833333395421505,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2843.166717529297,
      "epoch": 0.061142857142857145,
      "grad_norm": 0.2774275839328766,
      "kl": 0.3834228515625,
      "learning_rate": 9.66045715125541e-05,
      "loss": 0.0528,
      "reward": 0.12887566909193993,
      "reward_std": 0.21474172547459602,
      "rewards/cosine_scaled_reward": -0.22722883895039558,
      "rewards/format_reward": 0.5833333358168602,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2895.0000610351562,
      "epoch": 0.061714285714285715,
      "grad_norm": 0.3574961721897125,
      "kl": 0.34130859375,
      "learning_rate": 9.648384182148252e-05,
      "loss": -0.0363,
      "reward": 0.0623416006565094,
      "reward_std": 0.5782450139522552,
      "rewards/cosine_scaled_reward": -0.3021625205874443,
      "rewards/format_reward": 0.6666666679084301,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3446.9583740234375,
      "epoch": 0.062285714285714285,
      "grad_norm": 0.3407225012779236,
      "kl": 0.269287109375,
      "learning_rate": 9.636109026648555e-05,
      "loss": 0.0503,
      "reward": 0.2119973637163639,
      "reward_std": 0.6676298193633556,
      "rewards/cosine_scaled_reward": -0.08150134235620499,
      "rewards/format_reward": 0.3750000111758709,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3102.0,
      "epoch": 0.06285714285714286,
      "grad_norm": 0.36664795875549316,
      "kl": 0.21533203125,
      "learning_rate": 9.623632283030079e-05,
      "loss": -0.0268,
      "reward": -0.14389869943261147,
      "reward_std": 0.4256294723600149,
      "rewards/cosine_scaled_reward": -0.21778268064372241,
      "rewards/format_reward": 0.2916666679084301,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2798.1667098999023,
      "epoch": 0.06342857142857143,
      "grad_norm": 0.19732221961021423,
      "kl": 0.21051025390625,
      "learning_rate": 9.610954559391703e-05,
      "loss": 0.0356,
      "reward": -0.028341025114059448,
      "reward_std": 0.41592887230217457,
      "rewards/cosine_scaled_reward": -0.16000381857156754,
      "rewards/format_reward": 0.2916666679084301,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3370.5416870117188,
      "epoch": 0.064,
      "grad_norm": 0.23629334568977356,
      "kl": 0.22021484375,
      "learning_rate": 9.598076473627798e-05,
      "loss": 0.0157,
      "reward": -0.1737063229084015,
      "reward_std": 0.5679741557687521,
      "rewards/cosine_scaled_reward": -0.27435317635536194,
      "rewards/format_reward": 0.375,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3097.250030517578,
      "epoch": 0.06457142857142857,
      "grad_norm": 0.20911185443401337,
      "kl": 0.1754150390625,
      "learning_rate": 9.58499865339809e-05,
      "loss": -0.0049,
      "reward": -0.31149674439802766,
      "reward_std": 0.4890762511640787,
      "rewards/cosine_scaled_reward": -0.3432483784854412,
      "rewards/format_reward": 0.375,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3010.5834045410156,
      "epoch": 0.06514285714285714,
      "grad_norm": 0.1976766437292099,
      "kl": 0.11474609375,
      "learning_rate": 9.571721736097089e-05,
      "loss": -0.0025,
      "reward": -0.04094894975423813,
      "reward_std": 0.6383696794509888,
      "rewards/cosine_scaled_reward": -0.22880780510604382,
      "rewards/format_reward": 0.4166666716337204,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2820.5834350585938,
      "epoch": 0.06571428571428571,
      "grad_norm": 0.13228124380111694,
      "kl": 0.1092529296875,
      "learning_rate": 9.558246368823013e-05,
      "loss": 0.0317,
      "reward": 0.16390416398644447,
      "reward_std": 0.8831267654895782,
      "rewards/cosine_scaled_reward": -0.23054791847243905,
      "rewards/format_reward": 0.6250000149011612,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3487.75,
      "epoch": 0.06628571428571428,
      "grad_norm": 0.13215263187885284,
      "kl": 0.14013671875,
      "learning_rate": 9.544573208346253e-05,
      "loss": 0.0268,
      "reward": -0.4231237219646573,
      "reward_std": 0.5401189308613539,
      "rewards/cosine_scaled_reward": -0.2948951981961727,
      "rewards/format_reward": 0.1666666679084301,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3039.0833435058594,
      "epoch": 0.06685714285714285,
      "grad_norm": 0.16131699085235596,
      "kl": 0.1080322265625,
      "learning_rate": 9.530702921077358e-05,
      "loss": -0.0214,
      "reward": 0.22934666275978088,
      "reward_std": 0.5350189581513405,
      "rewards/cosine_scaled_reward": -0.03115999698638916,
      "rewards/format_reward": 0.2916666679084301,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2498.0000915527344,
      "epoch": 0.06742857142857143,
      "grad_norm": 0.3315119743347168,
      "kl": 0.0802001953125,
      "learning_rate": 9.516636183034565e-05,
      "loss": 0.0695,
      "reward": 0.9933711041230708,
      "reward_std": 1.1258303076028824,
      "rewards/cosine_scaled_reward": 0.14251888822764158,
      "rewards/format_reward": 0.7083333432674408,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2174.25,
      "epoch": 0.068,
      "grad_norm": 0.4764270484447479,
      "kl": 0.1552734375,
      "learning_rate": 9.50237367981084e-05,
      "loss": -0.011,
      "reward": 0.7270113378763199,
      "reward_std": 0.5169851435348392,
      "rewards/cosine_scaled_reward": 0.11350566893815994,
      "rewards/format_reward": 0.5,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3331.7084350585938,
      "epoch": 0.06857142857142857,
      "grad_norm": 0.23218177258968353,
      "kl": 0.08642578125,
      "learning_rate": 9.487916106540466e-05,
      "loss": 0.0861,
      "reward": 0.3240978792309761,
      "reward_std": 0.4474998824298382,
      "rewards/cosine_scaled_reward": -0.004617743194103241,
      "rewards/format_reward": 0.3333333358168602,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2834.0416717529297,
      "epoch": 0.06914285714285714,
      "grad_norm": 0.11969159543514252,
      "kl": 0.096771240234375,
      "learning_rate": 9.473264167865173e-05,
      "loss": 0.0116,
      "reward": -0.09953830391168594,
      "reward_std": 0.4128492996096611,
      "rewards/cosine_scaled_reward": -0.17476914450526237,
      "rewards/format_reward": 0.25,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2239.7084045410156,
      "epoch": 0.06971428571428571,
      "grad_norm": 0.13540911674499512,
      "kl": 0.0828094482421875,
      "learning_rate": 9.458418577899775e-05,
      "loss": -0.0462,
      "reward": 1.4352559298276901,
      "reward_std": 0.4657023213803768,
      "rewards/cosine_scaled_reward": 0.34262790158391,
      "rewards/format_reward": 0.75,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3056.8750610351562,
      "epoch": 0.07028571428571428,
      "grad_norm": 0.3198010325431824,
      "kl": 0.1041259765625,
      "learning_rate": 9.443380060197387e-05,
      "loss": 0.0968,
      "reward": 0.48675229772925377,
      "reward_std": 0.8716996815055609,
      "rewards/cosine_scaled_reward": -0.0482905525714159,
      "rewards/format_reward": 0.5833333432674408,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3280.5833740234375,
      "epoch": 0.07085714285714285,
      "grad_norm": 0.12906375527381897,
      "kl": 0.1143798828125,
      "learning_rate": 9.428149347714143e-05,
      "loss": 0.0243,
      "reward": 0.5198042392730713,
      "reward_std": 0.2846983075141907,
      "rewards/cosine_scaled_reward": 0.00990208238363266,
      "rewards/format_reward": 0.5,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2674.8333740234375,
      "epoch": 0.07142857142857142,
      "grad_norm": 0.21717137098312378,
      "kl": 0.11065673828125,
      "learning_rate": 9.412727182773487e-05,
      "loss": -0.0629,
      "reward": 0.5749734938144684,
      "reward_std": 1.0196832083165646,
      "rewards/cosine_scaled_reward": 0.01665341481566429,
      "rewards/format_reward": 0.5416666716337204,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2071.125045776367,
      "epoch": 0.072,
      "grad_norm": 0.1411234438419342,
      "kl": 0.0943756103515625,
      "learning_rate": 9.397114317029975e-05,
      "loss": 0.0888,
      "reward": 0.553262030123733,
      "reward_std": 0.7625375427305698,
      "rewards/cosine_scaled_reward": 0.005797676742076874,
      "rewards/format_reward": 0.5416666679084301,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3094.000030517578,
      "epoch": 0.07257142857142856,
      "grad_norm": 0.41723886132240295,
      "kl": 0.1458740234375,
      "learning_rate": 9.381311511432659e-05,
      "loss": 0.1064,
      "reward": 0.19710233807563782,
      "reward_std": 0.8968651816248894,
      "rewards/cosine_scaled_reward": -0.06811549002304673,
      "rewards/format_reward": 0.3333333432674408,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3223.4583740234375,
      "epoch": 0.07314285714285715,
      "grad_norm": 0.4333489239215851,
      "kl": 0.151611328125,
      "learning_rate": 9.36531953618799e-05,
      "loss": 0.1332,
      "reward": 0.1836181916296482,
      "reward_std": 0.7244069799780846,
      "rewards/cosine_scaled_reward": -0.05402424931526184,
      "rewards/format_reward": 0.2916666753590107,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2923.3334350585938,
      "epoch": 0.07371428571428572,
      "grad_norm": 0.40884310007095337,
      "kl": 0.17724609375,
      "learning_rate": 9.349139170722281e-05,
      "loss": 0.0865,
      "reward": 0.1987609639763832,
      "reward_std": 0.6052179206162691,
      "rewards/cosine_scaled_reward": -0.1506195142865181,
      "rewards/format_reward": 0.5000000111758709,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3203.291748046875,
      "epoch": 0.07428571428571429,
      "grad_norm": 0.38833317160606384,
      "kl": 0.321533203125,
      "learning_rate": 9.332771203643715e-05,
      "loss": 0.1088,
      "reward": 0.03225439786911011,
      "reward_std": 0.9268941730260849,
      "rewards/cosine_scaled_reward": -0.15053946431726217,
      "rewards/format_reward": 0.3333333432674408,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2999.166717529297,
      "epoch": 0.07485714285714286,
      "grad_norm": 0.22766365110874176,
      "kl": 0.297119140625,
      "learning_rate": 9.316216432703917e-05,
      "loss": 0.022,
      "reward": 0.0023300140164792538,
      "reward_std": 0.5885078124701977,
      "rewards/cosine_scaled_reward": -0.2488350011408329,
      "rewards/format_reward": 0.5000000223517418,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2991.4584350585938,
      "epoch": 0.07542857142857143,
      "grad_norm": 0.22822511196136475,
      "kl": 0.53076171875,
      "learning_rate": 9.299475664759069e-05,
      "loss": 0.0249,
      "reward": 0.4181240275502205,
      "reward_std": 0.45995941013097763,
      "rewards/cosine_scaled_reward": -0.22843801230192184,
      "rewards/format_reward": 0.8750000149011612,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2522.5000610351562,
      "epoch": 0.076,
      "grad_norm": 0.6402227282524109,
      "kl": 0.5380859375,
      "learning_rate": 9.28254971573058e-05,
      "loss": 0.2187,
      "reward": 1.112146245315671,
      "reward_std": 0.6289810538291931,
      "rewards/cosine_scaled_reward": 0.13940642774105072,
      "rewards/format_reward": 0.8333333432674408,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2573.33349609375,
      "epoch": 0.07657142857142857,
      "grad_norm": 0.34855917096138,
      "kl": 0.744140625,
      "learning_rate": 9.265439410565329e-05,
      "loss": 0.1362,
      "reward": 0.5219381079077721,
      "reward_std": 0.6780424751341343,
      "rewards/cosine_scaled_reward": -0.13486428651958704,
      "rewards/format_reward": 0.7916666716337204,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2322.7500915527344,
      "epoch": 0.07714285714285714,
      "grad_norm": 0.3246491551399231,
      "kl": 0.55712890625,
      "learning_rate": 9.248145583195448e-05,
      "loss": 0.1347,
      "reward": 0.7710914388298988,
      "reward_std": 0.5349869206547737,
      "rewards/cosine_scaled_reward": -0.051954299211502075,
      "rewards/format_reward": 0.8750000149011612,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1618.375015258789,
      "epoch": 0.07771428571428571,
      "grad_norm": 0.2646270990371704,
      "kl": 0.3037109375,
      "learning_rate": 9.230669076497688e-05,
      "loss": 0.0348,
      "reward": 0.6177230253815651,
      "reward_std": 0.5647286213934422,
      "rewards/cosine_scaled_reward": -0.17030514776706696,
      "rewards/format_reward": 0.9583333432674408,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2437.125045776367,
      "epoch": 0.07828571428571429,
      "grad_norm": 0.3581116199493408,
      "kl": 0.67254638671875,
      "learning_rate": 9.213010742252328e-05,
      "loss": 0.1788,
      "reward": 0.2514616549015045,
      "reward_std": 0.8865174166858196,
      "rewards/cosine_scaled_reward": -0.12426918093115091,
      "rewards/format_reward": 0.5000000074505806,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2302.041717529297,
      "epoch": 0.07885714285714286,
      "grad_norm": 0.5932723879814148,
      "kl": 0.683349609375,
      "learning_rate": 9.195171441101669e-05,
      "loss": 0.2266,
      "reward": 0.48782986029982567,
      "reward_std": 0.6536561995744705,
      "rewards/cosine_scaled_reward": -0.11025174707174301,
      "rewards/format_reward": 0.7083333432674408,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2605.3334350585938,
      "epoch": 0.07942857142857143,
      "grad_norm": 0.3788131773471832,
      "kl": 1.0732421875,
      "learning_rate": 9.177152042508078e-05,
      "loss": 0.1714,
      "reward": 0.49786752462387085,
      "reward_std": 0.7835120111703873,
      "rewards/cosine_scaled_reward": -0.06356624886393547,
      "rewards/format_reward": 0.6250000223517418,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2648.7084350585938,
      "epoch": 0.08,
      "grad_norm": 0.9030252695083618,
      "kl": 1.1220703125,
      "learning_rate": 9.158953424711625e-05,
      "loss": 0.1234,
      "reward": 0.28087351471185684,
      "reward_std": 0.7202268093824387,
      "rewards/cosine_scaled_reward": -0.13039657729677856,
      "rewards/format_reward": 0.541666679084301,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2024.4167175292969,
      "epoch": 0.08057142857142857,
      "grad_norm": 1.2033867835998535,
      "kl": 0.7147216796875,
      "learning_rate": 9.140576474687264e-05,
      "loss": -0.0838,
      "reward": 0.22873285971581936,
      "reward_std": 0.2747483551502228,
      "rewards/cosine_scaled_reward": -0.17730024084448814,
      "rewards/format_reward": 0.5833333358168602,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1909.3334045410156,
      "epoch": 0.08114285714285714,
      "grad_norm": 0.3704441487789154,
      "kl": 0.80645751953125,
      "learning_rate": 9.122022088101614e-05,
      "loss": 0.0942,
      "reward": 0.4351644292473793,
      "reward_std": 0.7959753908216953,
      "rewards/cosine_scaled_reward": -0.1574177942238748,
      "rewards/format_reward": 0.7500000074505806,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2298.6250915527344,
      "epoch": 0.08171428571428571,
      "grad_norm": 0.2989516854286194,
      "kl": 0.9326171875,
      "learning_rate": 9.1032911692693e-05,
      "loss": 0.2218,
      "reward": 0.35630420781672,
      "reward_std": 0.6438803896307945,
      "rewards/cosine_scaled_reward": -0.15518124029040337,
      "rewards/format_reward": 0.666666679084301,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1725.2500305175781,
      "epoch": 0.08228571428571428,
      "grad_norm": 0.36004939675331116,
      "kl": 0.5740966796875,
      "learning_rate": 9.084384631108883e-05,
      "loss": 0.1278,
      "reward": 0.6938553377985954,
      "reward_std": 0.5497538670897484,
      "rewards/cosine_scaled_reward": -0.04890568554401398,
      "rewards/format_reward": 0.7916666716337204,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1852.3750610351562,
      "epoch": 0.08285714285714285,
      "grad_norm": 0.6046501398086548,
      "kl": 0.70703125,
      "learning_rate": 9.065303395098359e-05,
      "loss": 0.0028,
      "reward": 0.6369144171476364,
      "reward_std": 0.5210181921720505,
      "rewards/cosine_scaled_reward": -0.1398761412128806,
      "rewards/format_reward": 0.9166666865348816,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2484.916748046875,
      "epoch": 0.08342857142857144,
      "grad_norm": 0.2475971132516861,
      "kl": 0.73388671875,
      "learning_rate": 9.046048391230248e-05,
      "loss": 0.0756,
      "reward": 0.7248256802558899,
      "reward_std": 0.6479251198470592,
      "rewards/cosine_scaled_reward": -0.09592050686478615,
      "rewards/format_reward": 0.9166666865348816,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2398.166748046875,
      "epoch": 0.084,
      "grad_norm": 0.6569065451622009,
      "kl": 0.87890625,
      "learning_rate": 9.02662055796628e-05,
      "loss": -0.0233,
      "reward": 0.8563184477388859,
      "reward_std": 0.6260966360569,
      "rewards/cosine_scaled_reward": 0.07399253733456135,
      "rewards/format_reward": 0.7083333507180214,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2529.291748046875,
      "epoch": 0.08457142857142858,
      "grad_norm": 0.41076308488845825,
      "kl": 0.578125,
      "learning_rate": 9.007020842191635e-05,
      "loss": 0.0031,
      "reward": 0.7412142492830753,
      "reward_std": 0.47748080641031265,
      "rewards/cosine_scaled_reward": -0.10855956003069878,
      "rewards/format_reward": 0.9583333432674408,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3347.4166870117188,
      "epoch": 0.08514285714285715,
      "grad_norm": 0.16827178001403809,
      "kl": 0.52197265625,
      "learning_rate": 8.987250199168808e-05,
      "loss": 0.0373,
      "reward": 0.504796092864126,
      "reward_std": 0.5495826080441475,
      "rewards/cosine_scaled_reward": -0.03926862310618162,
      "rewards/format_reward": 0.5833333358168602,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2754.791717529297,
      "epoch": 0.08571428571428572,
      "grad_norm": 0.531985342502594,
      "kl": 0.2252197265625,
      "learning_rate": 8.967309592491052e-05,
      "loss": -0.098,
      "reward": 0.4745659134350717,
      "reward_std": 0.4885084852576256,
      "rewards/cosine_scaled_reward": -0.03355037793517113,
      "rewards/format_reward": 0.5416666679084301,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2840.7500915527344,
      "epoch": 0.08628571428571429,
      "grad_norm": 0.1887764185667038,
      "kl": 0.278228759765625,
      "learning_rate": 8.947199994035401e-05,
      "loss": 0.0466,
      "reward": 0.10480327904224396,
      "reward_std": 0.890035405755043,
      "rewards/cosine_scaled_reward": -0.2600983753800392,
      "rewards/format_reward": 0.6250000037252903,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3203.3750610351562,
      "epoch": 0.08685714285714285,
      "grad_norm": 0.111234150826931,
      "kl": 0.203857421875,
      "learning_rate": 8.926922383915316e-05,
      "loss": 0.0474,
      "reward": 0.10628402233123779,
      "reward_std": 0.44779110699892044,
      "rewards/cosine_scaled_reward": -0.11352465860545635,
      "rewards/format_reward": 0.3333333358168602,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3050.9166870117188,
      "epoch": 0.08742857142857142,
      "grad_norm": 0.2246592491865158,
      "kl": 0.193359375,
      "learning_rate": 8.906477750432904e-05,
      "loss": -0.0286,
      "reward": 0.7106999894604087,
      "reward_std": 0.9839373230934143,
      "rewards/cosine_scaled_reward": 0.12618330994155258,
      "rewards/format_reward": 0.4583333395421505,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3488.1250610351562,
      "epoch": 0.088,
      "grad_norm": 0.299188494682312,
      "kl": 0.29833984375,
      "learning_rate": 8.885867090030761e-05,
      "loss": 0.0391,
      "reward": -0.033790960907936096,
      "reward_std": 0.9646570086479187,
      "rewards/cosine_scaled_reward": -0.12106215953826904,
      "rewards/format_reward": 0.2083333395421505,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3418.541748046875,
      "epoch": 0.08857142857142856,
      "grad_norm": 0.15048454701900482,
      "kl": 0.1529541015625,
      "learning_rate": 8.865091407243394e-05,
      "loss": 0.0736,
      "reward": -0.3656083308160305,
      "reward_std": 0.6392169110476971,
      "rewards/cosine_scaled_reward": -0.2869708426296711,
      "rewards/format_reward": 0.2083333395421505,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2060.1250610351562,
      "epoch": 0.08914285714285715,
      "grad_norm": 0.22759364545345306,
      "kl": 0.10736083984375,
      "learning_rate": 8.844151714648274e-05,
      "loss": 0.0749,
      "reward": 1.4114615470170975,
      "reward_std": 0.6891137436032295,
      "rewards/cosine_scaled_reward": 0.24739742651581764,
      "rewards/format_reward": 0.9166666716337204,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2715.7083740234375,
      "epoch": 0.08971428571428572,
      "grad_norm": 0.1746428906917572,
      "kl": 0.19403076171875,
      "learning_rate": 8.823049032816479e-05,
      "loss": 0.104,
      "reward": -0.2559436559677124,
      "reward_std": 0.42704229010269046,
      "rewards/cosine_scaled_reward": -0.3363051738124341,
      "rewards/format_reward": 0.4166666716337204,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2920.7500610351562,
      "epoch": 0.09028571428571429,
      "grad_norm": 0.1731269210577011,
      "kl": 0.09033203125,
      "learning_rate": 8.801784390262944e-05,
      "loss": 0.0437,
      "reward": 0.4546380043029785,
      "reward_std": 0.8263214789330959,
      "rewards/cosine_scaled_reward": -0.08518100716173649,
      "rewards/format_reward": 0.6250000149011612,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3434.666748046875,
      "epoch": 0.09085714285714286,
      "grad_norm": 0.17278991639614105,
      "kl": 0.157958984375,
      "learning_rate": 8.780358823396352e-05,
      "loss": 0.0737,
      "reward": -0.6269425004720688,
      "reward_std": 0.387746115680784,
      "rewards/cosine_scaled_reward": -0.3968045935034752,
      "rewards/format_reward": 0.1666666716337204,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3551.75,
      "epoch": 0.09142857142857143,
      "grad_norm": 0.2092837542295456,
      "kl": 0.26220703125,
      "learning_rate": 8.758773376468606e-05,
      "loss": 0.0227,
      "reward": -0.5527655929327011,
      "reward_std": 0.5685544777661562,
      "rewards/cosine_scaled_reward": -0.31804945319890976,
      "rewards/format_reward": 0.0833333358168602,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3314.8333740234375,
      "epoch": 0.092,
      "grad_norm": 0.18133918941020966,
      "kl": 0.1513671875,
      "learning_rate": 8.73702910152393e-05,
      "loss": 0.0928,
      "reward": 0.07757844589650631,
      "reward_std": 1.0933372657746077,
      "rewards/cosine_scaled_reward": -0.06537744263187051,
      "rewards/format_reward": 0.2083333395421505,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3062.8750610351562,
      "epoch": 0.09257142857142857,
      "grad_norm": 0.18608345091342926,
      "kl": 0.1959228515625,
      "learning_rate": 8.715127058347615e-05,
      "loss": -0.0589,
      "reward": 0.19248175248503685,
      "reward_std": 0.7195432111620903,
      "rewards/cosine_scaled_reward": -0.09125912375748158,
      "rewards/format_reward": 0.3750000037252903,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3412.6250610351562,
      "epoch": 0.09314285714285714,
      "grad_norm": 0.19522501528263092,
      "kl": 0.1484375,
      "learning_rate": 8.693068314414344e-05,
      "loss": 0.0603,
      "reward": -0.07296949997544289,
      "reward_std": 0.6726852059364319,
      "rewards/cosine_scaled_reward": -0.16148475895170122,
      "rewards/format_reward": 0.2500000037252903,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3415.5833740234375,
      "epoch": 0.09371428571428571,
      "grad_norm": 0.16382084786891937,
      "kl": 0.23876953125,
      "learning_rate": 8.670853944836176e-05,
      "loss": 0.0688,
      "reward": 0.06261083483695984,
      "reward_std": 0.6757630333304405,
      "rewards/cosine_scaled_reward": -0.11452792584896088,
      "rewards/format_reward": 0.2916666716337204,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3390.25,
      "epoch": 0.09428571428571429,
      "grad_norm": 0.13250745832920074,
      "kl": 0.18994140625,
      "learning_rate": 8.648485032310145e-05,
      "loss": 0.0562,
      "reward": -0.657595120370388,
      "reward_std": 0.1904697474092245,
      "rewards/cosine_scaled_reward": -0.4121309034526348,
      "rewards/format_reward": 0.1666666716337204,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3296.9583740234375,
      "epoch": 0.09485714285714286,
      "grad_norm": 0.1458793729543686,
      "kl": 0.1710205078125,
      "learning_rate": 8.625962667065488e-05,
      "loss": 0.0482,
      "reward": 0.298754021525383,
      "reward_std": 0.43109990283846855,
      "rewards/cosine_scaled_reward": -0.0172896608710289,
      "rewards/format_reward": 0.3333333358168602,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3031.4166870117188,
      "epoch": 0.09542857142857143,
      "grad_norm": 0.5354498624801636,
      "kl": 0.2393798828125,
      "learning_rate": 8.603287946810515e-05,
      "loss": 0.1521,
      "reward": 0.740845113992691,
      "reward_std": 1.2252501547336578,
      "rewards/cosine_scaled_reward": 0.12042254209518433,
      "rewards/format_reward": 0.5000000186264515,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3194.541748046875,
      "epoch": 0.096,
      "grad_norm": 0.19526343047618866,
      "kl": 0.20654296875,
      "learning_rate": 8.5804619766791e-05,
      "loss": 0.0617,
      "reward": 0.026736490428447723,
      "reward_std": 0.7318253479897976,
      "rewards/cosine_scaled_reward": -0.1324650919996202,
      "rewards/format_reward": 0.2916666716337204,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2889.2083435058594,
      "epoch": 0.09657142857142857,
      "grad_norm": 0.13189108669757843,
      "kl": 0.2628173828125,
      "learning_rate": 8.557485869176826e-05,
      "loss": 0.0467,
      "reward": 0.3621276989579201,
      "reward_std": 0.5554591603577137,
      "rewards/cosine_scaled_reward": -0.08976950496435165,
      "rewards/format_reward": 0.5416666679084301,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3473.7916870117188,
      "epoch": 0.09714285714285714,
      "grad_norm": 0.17061442136764526,
      "kl": 0.42626953125,
      "learning_rate": 8.534360744126755e-05,
      "loss": 0.0345,
      "reward": 0.26488298177719116,
      "reward_std": 0.6278351005166769,
      "rewards/cosine_scaled_reward": -0.11755852587521076,
      "rewards/format_reward": 0.5000000149011612,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3441.2918090820312,
      "epoch": 0.09771428571428571,
      "grad_norm": 0.2095242440700531,
      "kl": 0.5634765625,
      "learning_rate": 8.511087728614862e-05,
      "loss": 0.0491,
      "reward": 0.1459126800764352,
      "reward_std": 0.7411493808031082,
      "rewards/cosine_scaled_reward": -0.2603769972920418,
      "rewards/format_reward": 0.6666666865348816,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3050.1250610351562,
      "epoch": 0.09828571428571428,
      "grad_norm": 0.3047703504562378,
      "kl": 0.354248046875,
      "learning_rate": 8.487667956935088e-05,
      "loss": 0.1551,
      "reward": 0.09439549967646599,
      "reward_std": 0.3907792381942272,
      "rewards/cosine_scaled_reward": -0.244468929246068,
      "rewards/format_reward": 0.5833333358168602,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3115.4583740234375,
      "epoch": 0.09885714285714285,
      "grad_norm": 0.22464460134506226,
      "kl": 0.40625,
      "learning_rate": 8.464102570534061e-05,
      "loss": 0.0295,
      "reward": 0.10012460593134165,
      "reward_std": 0.3750727055594325,
      "rewards/cosine_scaled_reward": -0.17910437239333987,
      "rewards/format_reward": 0.4583333432674408,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3400.25,
      "epoch": 0.09942857142857142,
      "grad_norm": 0.17063897848129272,
      "kl": 0.4248046875,
      "learning_rate": 8.440392717955476e-05,
      "loss": 0.0536,
      "reward": 0.10912856008508243,
      "reward_std": 0.19460038893157616,
      "rewards/cosine_scaled_reward": -0.11210238412604667,
      "rewards/format_reward": 0.3333333432674408,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2890.0000610351562,
      "epoch": 0.1,
      "grad_norm": 0.2909056842327118,
      "kl": 0.73583984375,
      "learning_rate": 8.416539554784089e-05,
      "loss": 0.1191,
      "reward": 0.5186500549316406,
      "reward_std": 0.5995163694024086,
      "rewards/cosine_scaled_reward": -0.13650833815336227,
      "rewards/format_reward": 0.7916666865348816,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3099.3333740234375,
      "epoch": 0.10057142857142858,
      "grad_norm": 0.38485434651374817,
      "kl": 0.67333984375,
      "learning_rate": 8.392544243589427e-05,
      "loss": 0.0273,
      "reward": 0.32382382079958916,
      "reward_std": 0.3924727290868759,
      "rewards/cosine_scaled_reward": -0.2339214440435171,
      "rewards/format_reward": 0.791666679084301,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2768.291748046875,
      "epoch": 0.10114285714285715,
      "grad_norm": 0.3035656213760376,
      "kl": 0.539306640625,
      "learning_rate": 8.368407953869104e-05,
      "loss": 0.0805,
      "reward": 1.056127205491066,
      "reward_std": 0.9225019067525864,
      "rewards/cosine_scaled_reward": 0.02806359902024269,
      "rewards/format_reward": 1.0,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3217.791748046875,
      "epoch": 0.10171428571428572,
      "grad_norm": 0.38410308957099915,
      "kl": 0.7138671875,
      "learning_rate": 8.34413186199183e-05,
      "loss": 0.0429,
      "reward": 0.2634657397866249,
      "reward_std": 0.4688725695014,
      "rewards/cosine_scaled_reward": -0.2432671394199133,
      "rewards/format_reward": 0.7500000149011612,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2783.1250610351562,
      "epoch": 0.10228571428571429,
      "grad_norm": 0.2818540632724762,
      "kl": 0.532958984375,
      "learning_rate": 8.319717151140073e-05,
      "loss": 0.0169,
      "reward": 1.0860532075166702,
      "reward_std": 0.5036949962377548,
      "rewards/cosine_scaled_reward": 0.08469325304031372,
      "rewards/format_reward": 0.9166666865348816,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3131.8333740234375,
      "epoch": 0.10285714285714286,
      "grad_norm": 0.1775483340024948,
      "kl": 0.7109375,
      "learning_rate": 8.295165011252397e-05,
      "loss": 0.0516,
      "reward": 0.6567038595676422,
      "reward_std": 0.8435460180044174,
      "rewards/cosine_scaled_reward": -0.10914808511734009,
      "rewards/format_reward": 0.8750000298023224,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3148.1250610351562,
      "epoch": 0.10342857142857143,
      "grad_norm": 0.233117014169693,
      "kl": 0.7314453125,
      "learning_rate": 8.270476638965462e-05,
      "loss": 0.0481,
      "reward": 0.3657357878983021,
      "reward_std": 0.47509450232610106,
      "rewards/cosine_scaled_reward": -0.17129878513514996,
      "rewards/format_reward": 0.7083333432674408,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3470.4166870117188,
      "epoch": 0.104,
      "grad_norm": 0.3096827268600464,
      "kl": 0.49169921875,
      "learning_rate": 8.245653237555706e-05,
      "loss": 0.0023,
      "reward": 1.0028938204050064,
      "reward_std": 0.3635401166975498,
      "rewards/cosine_scaled_reward": 0.10561357298865914,
      "rewards/format_reward": 0.7916666716337204,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3170.916748046875,
      "epoch": 0.10457142857142857,
      "grad_norm": 0.4448356032371521,
      "kl": 0.68212890625,
      "learning_rate": 8.220696016880688e-05,
      "loss": 0.0353,
      "reward": 0.3080389183014631,
      "reward_std": 0.3628546576946974,
      "rewards/cosine_scaled_reward": -0.20014722365885973,
      "rewards/format_reward": 0.7083333432674408,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2907.2500610351562,
      "epoch": 0.10514285714285715,
      "grad_norm": 0.2639898657798767,
      "kl": 0.619140625,
      "learning_rate": 8.195606193320136e-05,
      "loss": 0.112,
      "reward": 0.39144587703049183,
      "reward_std": 0.5591121315956116,
      "rewards/cosine_scaled_reward": -0.17927706986665726,
      "rewards/format_reward": 0.7500000111758709,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2645.5833740234375,
      "epoch": 0.10571428571428572,
      "grad_norm": 0.3939441442489624,
      "kl": 0.41021728515625,
      "learning_rate": 8.170384989716657e-05,
      "loss": 0.0815,
      "reward": 0.39359497651457787,
      "reward_std": 0.30915623903274536,
      "rewards/cosine_scaled_reward": -0.11570251337252557,
      "rewards/format_reward": 0.6250000111758709,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3049.291748046875,
      "epoch": 0.10628571428571429,
      "grad_norm": 0.180860236287117,
      "kl": 0.39794921875,
      "learning_rate": 8.14503363531613e-05,
      "loss": 0.0304,
      "reward": 0.6096877008676529,
      "reward_std": 0.5758876949548721,
      "rewards/cosine_scaled_reward": -0.04932282119989395,
      "rewards/format_reward": 0.7083333544433117,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3548.5833740234375,
      "epoch": 0.10685714285714286,
      "grad_norm": 0.1805480420589447,
      "kl": 0.51318359375,
      "learning_rate": 8.119553365707803e-05,
      "loss": 0.0357,
      "reward": -0.022652635350823402,
      "reward_std": 0.3080403096973896,
      "rewards/cosine_scaled_reward": -0.15715964138507843,
      "rewards/format_reward": 0.2916666753590107,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3463.8750610351562,
      "epoch": 0.10742857142857143,
      "grad_norm": 0.19337642192840576,
      "kl": 0.5703125,
      "learning_rate": 8.09394542276407e-05,
      "loss": 0.0474,
      "reward": 0.056963276118040085,
      "reward_std": 0.4269789531826973,
      "rewards/cosine_scaled_reward": -0.26318503729999065,
      "rewards/format_reward": 0.5833333544433117,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3522.1251220703125,
      "epoch": 0.108,
      "grad_norm": 0.17618711292743683,
      "kl": 0.47119140625,
      "learning_rate": 8.068211054579944e-05,
      "loss": 0.0353,
      "reward": 0.017235335893929005,
      "reward_std": 0.6464042738080025,
      "rewards/cosine_scaled_reward": -0.19971567392349243,
      "rewards/format_reward": 0.416666679084301,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3556.5416870117188,
      "epoch": 0.10857142857142857,
      "grad_norm": 0.20488235354423523,
      "kl": 0.4287109375,
      "learning_rate": 8.042351515412221e-05,
      "loss": 0.0113,
      "reward": -0.17852318100631237,
      "reward_std": 0.226779380813241,
      "rewards/cosine_scaled_reward": -0.27676159143447876,
      "rewards/format_reward": 0.3750000037252903,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2325.791702270508,
      "epoch": 0.10914285714285714,
      "grad_norm": 0.19414982199668884,
      "kl": 0.22698974609375,
      "learning_rate": 8.016368065618361e-05,
      "loss": 0.029,
      "reward": 0.07766161113977432,
      "reward_std": 0.23689523618668318,
      "rewards/cosine_scaled_reward": -0.25283585488796234,
      "rewards/format_reward": 0.5833333358168602,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3216.5416870117188,
      "epoch": 0.10971428571428571,
      "grad_norm": 0.12748228013515472,
      "kl": 0.3115234375,
      "learning_rate": 7.99026197159505e-05,
      "loss": 0.0574,
      "reward": 0.4967636591754854,
      "reward_std": 0.3701250050216913,
      "rewards/cosine_scaled_reward": 0.08171516214497387,
      "rewards/format_reward": 0.3333333358168602,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3243.0416870117188,
      "epoch": 0.11028571428571429,
      "grad_norm": 0.3466835021972656,
      "kl": 0.357666015625,
      "learning_rate": 7.964034505716477e-05,
      "loss": 0.1503,
      "reward": -0.2960766963660717,
      "reward_std": 0.3008538093417883,
      "rewards/cosine_scaled_reward": -0.2313716821372509,
      "rewards/format_reward": 0.1666666679084301,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2656.75,
      "epoch": 0.11085714285714286,
      "grad_norm": 0.26660433411598206,
      "kl": 0.1654052734375,
      "learning_rate": 7.93768694627233e-05,
      "loss": 0.1287,
      "reward": 0.832207377650775,
      "reward_std": 0.4687335812486708,
      "rewards/cosine_scaled_reward": 0.2286036813748069,
      "rewards/format_reward": 0.375,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3484.2083740234375,
      "epoch": 0.11142857142857143,
      "grad_norm": 0.1703551560640335,
      "kl": 0.350830078125,
      "learning_rate": 7.911220577405484e-05,
      "loss": 0.0664,
      "reward": -0.10861442796885967,
      "reward_std": 0.48796410858631134,
      "rewards/cosine_scaled_reward": -0.22097388468682766,
      "rewards/format_reward": 0.3333333358168602,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2822.5833435058594,
      "epoch": 0.112,
      "grad_norm": 0.14060811698436737,
      "kl": 0.263519287109375,
      "learning_rate": 7.884636689049423e-05,
      "loss": 0.0503,
      "reward": 0.03914413973689079,
      "reward_std": 0.42792966961860657,
      "rewards/cosine_scaled_reward": -0.12626126781105995,
      "rewards/format_reward": 0.2916666679084301,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3313.4583740234375,
      "epoch": 0.11257142857142857,
      "grad_norm": 0.266854852437973,
      "kl": 0.326171875,
      "learning_rate": 7.857936576865357e-05,
      "loss": 0.0504,
      "reward": 0.07123216986656189,
      "reward_std": 0.42342435102909803,
      "rewards/cosine_scaled_reward": -0.08938391506671906,
      "rewards/format_reward": 0.25,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2759.9583740234375,
      "epoch": 0.11314285714285714,
      "grad_norm": 0.3209151029586792,
      "kl": 0.31805419921875,
      "learning_rate": 7.831121542179087e-05,
      "loss": 0.1378,
      "reward": 0.6144078075885773,
      "reward_std": 0.735694158822298,
      "rewards/cosine_scaled_reward": 0.015537269413471222,
      "rewards/format_reward": 0.5833333432674408,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3151.916748046875,
      "epoch": 0.11371428571428571,
      "grad_norm": 0.1781582534313202,
      "kl": 0.284912109375,
      "learning_rate": 7.804192891917572e-05,
      "loss": 0.0324,
      "reward": 0.5205724835395813,
      "reward_std": 0.6832853183150291,
      "rewards/cosine_scaled_reward": -0.13554711267352104,
      "rewards/format_reward": 0.7916666716337204,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3056.8333740234375,
      "epoch": 0.11428571428571428,
      "grad_norm": 0.1799694299697876,
      "kl": 0.3466796875,
      "learning_rate": 7.777151938545237e-05,
      "loss": 0.0166,
      "reward": 0.2738894410431385,
      "reward_std": 0.6837660204619169,
      "rewards/cosine_scaled_reward": -0.11305528320372105,
      "rewards/format_reward": 0.5000000223517418,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3472.1250610351562,
      "epoch": 0.11485714285714285,
      "grad_norm": 0.14525848627090454,
      "kl": 0.32275390625,
      "learning_rate": 7.75e-05,
      "loss": 0.0428,
      "reward": 0.08027667133137584,
      "reward_std": 0.3925677575170994,
      "rewards/cosine_scaled_reward": -0.08486166223883629,
      "rewards/format_reward": 0.2500000037252903,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2908.4583740234375,
      "epoch": 0.11542857142857142,
      "grad_norm": 0.19700901210308075,
      "kl": 0.452392578125,
      "learning_rate": 7.72273839962904e-05,
      "loss": 0.0939,
      "reward": 0.23302962351590395,
      "reward_std": 0.5167482197284698,
      "rewards/cosine_scaled_reward": -0.19598520174622536,
      "rewards/format_reward": 0.6250000111758709,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3359.9584350585938,
      "epoch": 0.116,
      "grad_norm": 0.2025863081216812,
      "kl": 0.5537109375,
      "learning_rate": 7.695368466124298e-05,
      "loss": 0.0524,
      "reward": 0.5999528877437115,
      "reward_std": 0.7707385122776031,
      "rewards/cosine_scaled_reward": -0.07502354681491852,
      "rewards/format_reward": 0.7500000223517418,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3285.5416870117188,
      "epoch": 0.11657142857142858,
      "grad_norm": 0.19257992506027222,
      "kl": 0.386962890625,
      "learning_rate": 7.667891533457719e-05,
      "loss": 0.0554,
      "reward": 0.1710458118468523,
      "reward_std": 0.3957900758832693,
      "rewards/cosine_scaled_reward": -0.20614376943558455,
      "rewards/format_reward": 0.5833333432674408,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3412.166748046875,
      "epoch": 0.11714285714285715,
      "grad_norm": 0.3564988076686859,
      "kl": 0.58837890625,
      "learning_rate": 7.64030894081624e-05,
      "loss": 0.0441,
      "reward": 0.17527301236987114,
      "reward_std": 0.34667395800352097,
      "rewards/cosine_scaled_reward": -0.2665301710367203,
      "rewards/format_reward": 0.7083333358168602,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3419.9583740234375,
      "epoch": 0.11771428571428572,
      "grad_norm": 0.22991527616977692,
      "kl": 0.48876953125,
      "learning_rate": 7.612622032536509e-05,
      "loss": 0.0659,
      "reward": 0.006236948072910309,
      "reward_std": 0.43261654675006866,
      "rewards/cosine_scaled_reward": -0.26771486178040504,
      "rewards/format_reward": 0.5416666679084301,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3140.3750610351562,
      "epoch": 0.11828571428571429,
      "grad_norm": 0.2047661691904068,
      "kl": 0.6416015625,
      "learning_rate": 7.58483215803938e-05,
      "loss": 0.0602,
      "reward": 0.8695274218916893,
      "reward_std": 0.3878786191344261,
      "rewards/cosine_scaled_reward": 0.05976368859410286,
      "rewards/format_reward": 0.7500000074505806,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3045.5416870117188,
      "epoch": 0.11885714285714286,
      "grad_norm": 0.39017602801322937,
      "kl": 0.60546875,
      "learning_rate": 7.556940671764125e-05,
      "loss": 0.0101,
      "reward": 0.6794271823018789,
      "reward_std": 0.43701300024986267,
      "rewards/cosine_scaled_reward": -0.035286422818899155,
      "rewards/format_reward": 0.75,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2835.4583740234375,
      "epoch": 0.11942857142857143,
      "grad_norm": 0.24870344996452332,
      "kl": 0.4085693359375,
      "learning_rate": 7.52894893310244e-05,
      "loss": 0.1686,
      "reward": 0.6219545120256953,
      "reward_std": 0.4378160387277603,
      "rewards/cosine_scaled_reward": -0.001522757112979889,
      "rewards/format_reward": 0.6250000074505806,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3224.666748046875,
      "epoch": 0.12,
      "grad_norm": 0.18834112584590912,
      "kl": 0.361572265625,
      "learning_rate": 7.500858306332173e-05,
      "loss": 0.0575,
      "reward": 0.139042385853827,
      "reward_std": 0.2909255549311638,
      "rewards/cosine_scaled_reward": -0.20131214708089828,
      "rewards/format_reward": 0.541666679084301,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2521.500030517578,
      "epoch": 0.12057142857142857,
      "grad_norm": 0.1810249388217926,
      "kl": 0.42181396484375,
      "learning_rate": 7.472670160550849e-05,
      "loss": 0.0523,
      "reward": 0.6465329900383949,
      "reward_std": 0.5430318973958492,
      "rewards/cosine_scaled_reward": -0.11423350404947996,
      "rewards/format_reward": 0.875,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3383.3750610351562,
      "epoch": 0.12114285714285715,
      "grad_norm": 0.17248566448688507,
      "kl": 0.30029296875,
      "learning_rate": 7.444385869608922e-05,
      "loss": 0.0381,
      "reward": 0.07134938985109329,
      "reward_std": 0.43245443142950535,
      "rewards/cosine_scaled_reward": -0.11015863250941038,
      "rewards/format_reward": 0.291666679084301,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2899.6251220703125,
      "epoch": 0.12171428571428572,
      "grad_norm": 0.16710422933101654,
      "kl": 0.6005859375,
      "learning_rate": 7.416006812042828e-05,
      "loss": 0.0639,
      "reward": 0.9749854728579521,
      "reward_std": 0.8126206174492836,
      "rewards/cosine_scaled_reward": 0.008326039183884859,
      "rewards/format_reward": 0.9583333432674408,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2762.0416870117188,
      "epoch": 0.12228571428571429,
      "grad_norm": 0.13040971755981445,
      "kl": 0.37322998046875,
      "learning_rate": 7.387534371007797e-05,
      "loss": 0.0808,
      "reward": 0.23875866644084454,
      "reward_std": 0.4626100994646549,
      "rewards/cosine_scaled_reward": -0.19312065839767456,
      "rewards/format_reward": 0.6250000037252903,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3368.4166870117188,
      "epoch": 0.12285714285714286,
      "grad_norm": 0.22608883678913116,
      "kl": 0.642578125,
      "learning_rate": 7.358969934210438e-05,
      "loss": 0.0332,
      "reward": 0.07540189661085606,
      "reward_std": 0.31054434925317764,
      "rewards/cosine_scaled_reward": -0.31646573543548584,
      "rewards/format_reward": 0.7083333656191826,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1938.416748046875,
      "epoch": 0.12342857142857143,
      "grad_norm": 0.6437280178070068,
      "kl": 0.2008056640625,
      "learning_rate": 7.330314893841101e-05,
      "loss": -0.1161,
      "reward": 1.7220262214541435,
      "reward_std": 0.25383612513542175,
      "rewards/cosine_scaled_reward": 0.3818464130163193,
      "rewards/format_reward": 0.9583333432674408,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3377.7501220703125,
      "epoch": 0.124,
      "grad_norm": 0.18521635234355927,
      "kl": 0.61376953125,
      "learning_rate": 7.301570646506028e-05,
      "loss": 0.0548,
      "reward": 0.4902285588905215,
      "reward_std": 0.473490871489048,
      "rewards/cosine_scaled_reward": -0.1507190652191639,
      "rewards/format_reward": 0.791666679084301,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2740.458465576172,
      "epoch": 0.12457142857142857,
      "grad_norm": 0.47241246700286865,
      "kl": 0.3935546875,
      "learning_rate": 7.27273859315928e-05,
      "loss": 0.1019,
      "reward": 0.30729155242443085,
      "reward_std": 0.7103235945105553,
      "rewards/cosine_scaled_reward": -0.20052088797092438,
      "rewards/format_reward": 0.7083333507180214,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2876.7500610351562,
      "epoch": 0.12514285714285714,
      "grad_norm": 0.3306594789028168,
      "kl": 0.342529296875,
      "learning_rate": 7.243820139034464e-05,
      "loss": 0.0972,
      "reward": 0.5614618342369795,
      "reward_std": 0.49163829535245895,
      "rewards/cosine_scaled_reward": -0.11510240286588669,
      "rewards/format_reward": 0.7916666679084301,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2676.0000610351562,
      "epoch": 0.12571428571428572,
      "grad_norm": 0.3193364441394806,
      "kl": 0.3392333984375,
      "learning_rate": 7.214816693576235e-05,
      "loss": 0.0709,
      "reward": 1.6334106158465147,
      "reward_std": 0.7052293419837952,
      "rewards/cosine_scaled_reward": 0.33753862231969833,
      "rewards/format_reward": 0.9583333432674408,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2236.1666870117188,
      "epoch": 0.12628571428571428,
      "grad_norm": 0.2019508183002472,
      "kl": 0.35107421875,
      "learning_rate": 7.185729670371605e-05,
      "loss": -0.0156,
      "reward": 1.01932243257761,
      "reward_std": 0.6837839931249619,
      "rewards/cosine_scaled_reward": 0.009661169722676277,
      "rewards/format_reward": 1.0,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2180.4583435058594,
      "epoch": 0.12685714285714286,
      "grad_norm": 1.0214191675186157,
      "kl": 0.2244873046875,
      "learning_rate": 7.156560487081053e-05,
      "loss": 0.2367,
      "reward": 0.6144461743533611,
      "reward_std": 0.5636948570609093,
      "rewards/cosine_scaled_reward": -0.04694357141852379,
      "rewards/format_reward": 0.7083333544433117,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2938.791717529297,
      "epoch": 0.12742857142857142,
      "grad_norm": 0.20985785126686096,
      "kl": 0.47314453125,
      "learning_rate": 7.127310565369415e-05,
      "loss": 0.0899,
      "reward": 0.16650558728724718,
      "reward_std": 0.48911314830183983,
      "rewards/cosine_scaled_reward": -0.27091389521956444,
      "rewards/format_reward": 0.7083333432674408,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3043.2916870117188,
      "epoch": 0.128,
      "grad_norm": 0.1921759396791458,
      "kl": 0.6591796875,
      "learning_rate": 7.097981330836617e-05,
      "loss": 0.055,
      "reward": 0.8815609216690063,
      "reward_std": 0.6539080664515495,
      "rewards/cosine_scaled_reward": 0.0032804161310195923,
      "rewards/format_reward": 0.8750000298023224,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2789.3333740234375,
      "epoch": 0.12857142857142856,
      "grad_norm": 0.2785404324531555,
      "kl": 0.646484375,
      "learning_rate": 7.068574212948169e-05,
      "loss": 0.0309,
      "reward": 0.5741224549710751,
      "reward_std": 0.7463614344596863,
      "rewards/cosine_scaled_reward": -0.15043878043070436,
      "rewards/format_reward": 0.8750000149011612,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2685.666778564453,
      "epoch": 0.12914285714285714,
      "grad_norm": 0.594291090965271,
      "kl": 0.492431640625,
      "learning_rate": 7.03909064496551e-05,
      "loss": -0.0623,
      "reward": 1.1779837608337402,
      "reward_std": 0.5032703503966331,
      "rewards/cosine_scaled_reward": 0.08899188553914428,
      "rewards/format_reward": 1.0,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2405.416778564453,
      "epoch": 0.12971428571428573,
      "grad_norm": 0.2714992165565491,
      "kl": 0.485107421875,
      "learning_rate": 7.009532063876149e-05,
      "loss": 0.1552,
      "reward": 0.9733416438102722,
      "reward_std": 0.9007052779197693,
      "rewards/cosine_scaled_reward": 0.028337497264146805,
      "rewards/format_reward": 0.9166666865348816,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1738.8750305175781,
      "epoch": 0.13028571428571428,
      "grad_norm": 0.484445720911026,
      "kl": 0.4476318359375,
      "learning_rate": 6.979899910323624e-05,
      "loss": 0.2133,
      "reward": 1.399103358387947,
      "reward_std": 0.7133887782692909,
      "rewards/cosine_scaled_reward": 0.1995516661554575,
      "rewards/format_reward": 1.0,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2415.3334350585938,
      "epoch": 0.13085714285714287,
      "grad_norm": 0.2949579358100891,
      "kl": 0.6171875,
      "learning_rate": 6.9501956285373e-05,
      "loss": 0.0644,
      "reward": 1.2348989397287369,
      "reward_std": 0.8904998600482941,
      "rewards/cosine_scaled_reward": 0.13828279450535774,
      "rewards/format_reward": 0.9583333432674408,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2715.791748046875,
      "epoch": 0.13142857142857142,
      "grad_norm": 0.43443575501441956,
      "kl": 0.775390625,
      "learning_rate": 6.920420666261962e-05,
      "loss": 0.0894,
      "reward": 0.7100101904943585,
      "reward_std": 0.5167043209075928,
      "rewards/cosine_scaled_reward": -0.10332825779914856,
      "rewards/format_reward": 0.9166666865348816,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2503.2500610351562,
      "epoch": 0.132,
      "grad_norm": 0.21487928926944733,
      "kl": 0.7197265625,
      "learning_rate": 6.890576474687263e-05,
      "loss": 0.0538,
      "reward": 0.5765762068331242,
      "reward_std": 0.5030670911073685,
      "rewards/cosine_scaled_reward": -0.14921192079782486,
      "rewards/format_reward": 0.8750000149011612,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2205.666748046875,
      "epoch": 0.13257142857142856,
      "grad_norm": 0.2001338005065918,
      "kl": 0.562835693359375,
      "learning_rate": 6.860664508377001e-05,
      "loss": 0.1013,
      "reward": 0.8642504140734673,
      "reward_std": 0.5591230466961861,
      "rewards/cosine_scaled_reward": -0.026208162307739258,
      "rewards/format_reward": 0.9166666865348816,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2631.7916870117188,
      "epoch": 0.13314285714285715,
      "grad_norm": 0.28648537397384644,
      "kl": 0.7060546875,
      "learning_rate": 6.83068622519821e-05,
      "loss": 0.0581,
      "reward": 0.6012575253844261,
      "reward_std": 0.369386401027441,
      "rewards/cosine_scaled_reward": -0.17853792011737823,
      "rewards/format_reward": 0.9583333432674408,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2923.541748046875,
      "epoch": 0.1337142857142857,
      "grad_norm": 0.37634986639022827,
      "kl": 0.712890625,
      "learning_rate": 6.800643086250122e-05,
      "loss": 0.0713,
      "reward": 0.8183658458292484,
      "reward_std": 0.9408960342407227,
      "rewards/cosine_scaled_reward": 0.07584960013628006,
      "rewards/format_reward": 0.6666666939854622,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2446.3334045410156,
      "epoch": 0.13428571428571429,
      "grad_norm": 0.3095211386680603,
      "kl": 0.564208984375,
      "learning_rate": 6.770536555792944e-05,
      "loss": 0.0103,
      "reward": 1.0213327407836914,
      "reward_std": 0.6788808181881905,
      "rewards/cosine_scaled_reward": 0.0314996664528735,
      "rewards/format_reward": 0.9583333432674408,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1727.4167175292969,
      "epoch": 0.13485714285714287,
      "grad_norm": 0.31641384959220886,
      "kl": 0.52777099609375,
      "learning_rate": 6.740368101176496e-05,
      "loss": 0.0146,
      "reward": 0.42938170582056046,
      "reward_std": 0.7883919924497604,
      "rewards/cosine_scaled_reward": -0.20197583828121424,
      "rewards/format_reward": 0.8333333432674408,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2093.625045776367,
      "epoch": 0.13542857142857143,
      "grad_norm": 0.2643112540245056,
      "kl": 0.35101318359375,
      "learning_rate": 6.710139192768695e-05,
      "loss": 0.0559,
      "reward": 0.4819624274969101,
      "reward_std": 0.36040157824754715,
      "rewards/cosine_scaled_reward": -0.2173521351069212,
      "rewards/format_reward": 0.9166666865348816,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2892.5834350585938,
      "epoch": 0.136,
      "grad_norm": 0.30492836236953735,
      "kl": 0.6455078125,
      "learning_rate": 6.679851303883892e-05,
      "loss": 0.0055,
      "reward": 0.8520525968633592,
      "reward_std": 1.034329280257225,
      "rewards/cosine_scaled_reward": -0.032307060435414314,
      "rewards/format_reward": 0.9166666865348816,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3010.8750610351562,
      "epoch": 0.13657142857142857,
      "grad_norm": 0.2272806316614151,
      "kl": 0.548828125,
      "learning_rate": 6.649505910711058e-05,
      "loss": 0.0823,
      "reward": 0.9699130356311798,
      "reward_std": 0.7660344392061234,
      "rewards/cosine_scaled_reward": 0.0682898610830307,
      "rewards/format_reward": 0.8333333432674408,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3037.5000610351562,
      "epoch": 0.13714285714285715,
      "grad_norm": 0.3186228573322296,
      "kl": 0.7724609375,
      "learning_rate": 6.619104492241848e-05,
      "loss": 0.0644,
      "reward": 0.2878115465864539,
      "reward_std": 0.41039496660232544,
      "rewards/cosine_scaled_reward": -0.2935942467302084,
      "rewards/format_reward": 0.8750000298023224,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2400.5834197998047,
      "epoch": 0.1377142857142857,
      "grad_norm": 0.2673218548297882,
      "kl": 0.363006591796875,
      "learning_rate": 6.588648530198504e-05,
      "loss": 0.0143,
      "reward": 1.3452494442462921,
      "reward_std": 0.922061562538147,
      "rewards/cosine_scaled_reward": 0.29762469977140427,
      "rewards/format_reward": 0.7500000223517418,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3292.416748046875,
      "epoch": 0.1382857142857143,
      "grad_norm": 0.24196265637874603,
      "kl": 0.5322265625,
      "learning_rate": 6.558139508961655e-05,
      "loss": 0.0516,
      "reward": 0.5418386338278651,
      "reward_std": 0.6776624768972397,
      "rewards/cosine_scaled_reward": -0.0832473672926426,
      "rewards/format_reward": 0.708333358168602,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3185.2084350585938,
      "epoch": 0.13885714285714285,
      "grad_norm": 0.23741839826107025,
      "kl": 0.439208984375,
      "learning_rate": 6.527578915497951e-05,
      "loss": 0.0376,
      "reward": 0.351685244590044,
      "reward_std": 0.5521262660622597,
      "rewards/cosine_scaled_reward": -0.1574907097965479,
      "rewards/format_reward": 0.6666666679084301,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2508.5417709350586,
      "epoch": 0.13942857142857143,
      "grad_norm": 0.31169164180755615,
      "kl": 0.28839111328125,
      "learning_rate": 6.496968239287605e-05,
      "loss": 0.1361,
      "reward": 0.7412183582782745,
      "reward_std": 0.305687353014946,
      "rewards/cosine_scaled_reward": -0.025224164128303528,
      "rewards/format_reward": 0.7916666679084301,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3193.5416870117188,
      "epoch": 0.14,
      "grad_norm": 0.16886374354362488,
      "kl": 0.3564453125,
      "learning_rate": 6.466308972251785e-05,
      "loss": 0.0409,
      "reward": 1.4323917776346207,
      "reward_std": 0.8325834348797798,
      "rewards/cosine_scaled_reward": 0.27869584411382675,
      "rewards/format_reward": 0.8750000149011612,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2717.25,
      "epoch": 0.14057142857142857,
      "grad_norm": 0.4018855094909668,
      "kl": 0.23583984375,
      "learning_rate": 6.435602608679918e-05,
      "loss": 0.0438,
      "reward": 0.8349570259451866,
      "reward_std": 0.6186618842184544,
      "rewards/cosine_scaled_reward": 0.12581188417971134,
      "rewards/format_reward": 0.5833333358168602,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3374.1250610351562,
      "epoch": 0.14114285714285715,
      "grad_norm": 0.25589483976364136,
      "kl": 0.42626953125,
      "learning_rate": 6.404850645156841e-05,
      "loss": 0.0487,
      "reward": 0.25039391964673996,
      "reward_std": 1.0135847851634026,
      "rewards/cosine_scaled_reward": -0.14563637599349022,
      "rewards/format_reward": 0.5416666828095913,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1883.6667175292969,
      "epoch": 0.1417142857142857,
      "grad_norm": 0.3505682349205017,
      "kl": 0.22479248046875,
      "learning_rate": 6.374054580489874e-05,
      "loss": -0.1151,
      "reward": 1.0633280351758003,
      "reward_std": 0.4256577081978321,
      "rewards/cosine_scaled_reward": 0.07333065941929817,
      "rewards/format_reward": 0.9166666716337204,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3036.7501220703125,
      "epoch": 0.1422857142857143,
      "grad_norm": 0.19629524648189545,
      "kl": 0.340576171875,
      "learning_rate": 6.343215915635762e-05,
      "loss": 0.0384,
      "reward": 0.19172463938593864,
      "reward_std": 0.6912754252552986,
      "rewards/cosine_scaled_reward": -0.2999710142612457,
      "rewards/format_reward": 0.7916666716337204,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2983.166748046875,
      "epoch": 0.14285714285714285,
      "grad_norm": 0.3300791382789612,
      "kl": 0.298583984375,
      "learning_rate": 6.31233615362752e-05,
      "loss": 0.0773,
      "reward": 0.7938342844136059,
      "reward_std": 0.745312362909317,
      "rewards/cosine_scaled_reward": 0.021917149424552917,
      "rewards/format_reward": 0.7500000149011612,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2271.0833740234375,
      "epoch": 0.14342857142857143,
      "grad_norm": 0.17075654864311218,
      "kl": 0.1722412109375,
      "learning_rate": 6.281416799501188e-05,
      "loss": 0.0039,
      "reward": 0.8436714336276054,
      "reward_std": 0.580166794359684,
      "rewards/cosine_scaled_reward": -0.03649761341512203,
      "rewards/format_reward": 0.9166666865348816,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2436.0834197998047,
      "epoch": 0.144,
      "grad_norm": 0.4659833312034607,
      "kl": 0.2823486328125,
      "learning_rate": 6.250459360222461e-05,
      "loss": 0.1394,
      "reward": 0.4135246090590954,
      "reward_std": 0.6481917910277843,
      "rewards/cosine_scaled_reward": -0.14740438014268875,
      "rewards/format_reward": 0.7083333507180214,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2777.166732788086,
      "epoch": 0.14457142857142857,
      "grad_norm": 0.15978415310382843,
      "kl": 0.37872314453125,
      "learning_rate": 6.219465344613258e-05,
      "loss": 0.0411,
      "reward": 0.45254361629486084,
      "reward_std": 0.5954790785908699,
      "rewards/cosine_scaled_reward": -0.19039486348628998,
      "rewards/format_reward": 0.8333333432674408,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2861.5000915527344,
      "epoch": 0.14514285714285713,
      "grad_norm": 0.22069796919822693,
      "kl": 0.4195556640625,
      "learning_rate": 6.188436263278172e-05,
      "loss": -0.0014,
      "reward": 0.8079469501972198,
      "reward_std": 0.8555595129728317,
      "rewards/cosine_scaled_reward": -0.054359860718250275,
      "rewards/format_reward": 0.9166666865348816,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3140.9166870117188,
      "epoch": 0.1457142857142857,
      "grad_norm": 0.3371999263763428,
      "kl": 0.51953125,
      "learning_rate": 6.157373628530852e-05,
      "loss": 0.0141,
      "reward": 0.10784337669610977,
      "reward_std": 0.4433470778167248,
      "rewards/cosine_scaled_reward": -0.2585783116519451,
      "rewards/format_reward": 0.6250000298023224,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3364.416748046875,
      "epoch": 0.1462857142857143,
      "grad_norm": 0.2070639282464981,
      "kl": 0.48974609375,
      "learning_rate": 6.126278954320295e-05,
      "loss": 0.0582,
      "reward": 0.5357752754352987,
      "reward_std": 0.6060156896710396,
      "rewards/cosine_scaled_reward": -0.16961237788200378,
      "rewards/format_reward": 0.8750000149011612,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2713.7918090820312,
      "epoch": 0.14685714285714285,
      "grad_norm": 0.3514373302459717,
      "kl": 0.355712890625,
      "learning_rate": 6.095153756157051e-05,
      "loss": 0.094,
      "reward": 1.262501284480095,
      "reward_std": 0.9425565153360367,
      "rewards/cosine_scaled_reward": 0.15208394452929497,
      "rewards/format_reward": 0.9583333432674408,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1865.4166870117188,
      "epoch": 0.14742857142857144,
      "grad_norm": 0.19031290709972382,
      "kl": 0.2869873046875,
      "learning_rate": 6.06399955103937e-05,
      "loss": 0.2014,
      "reward": 1.08034697920084,
      "reward_std": 0.29667292069643736,
      "rewards/cosine_scaled_reward": 0.10267347283661366,
      "rewards/format_reward": 0.8750000149011612,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2987.041748046875,
      "epoch": 0.148,
      "grad_norm": 0.1868191808462143,
      "kl": 0.50732421875,
      "learning_rate": 6.032817857379256e-05,
      "loss": 0.0605,
      "reward": 0.4671786054968834,
      "reward_std": 0.7795231863856316,
      "rewards/cosine_scaled_reward": -0.14141068421304226,
      "rewards/format_reward": 0.7500000298023224,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3072.0001220703125,
      "epoch": 0.14857142857142858,
      "grad_norm": 0.19339028000831604,
      "kl": 0.48876953125,
      "learning_rate": 6.001610194928464e-05,
      "loss": 0.044,
      "reward": 0.5511476572137326,
      "reward_std": 0.8131291791796684,
      "rewards/cosine_scaled_reward": -0.14109284430742264,
      "rewards/format_reward": 0.8333333432674408,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3000.041748046875,
      "epoch": 0.14914285714285713,
      "grad_norm": 0.24656662344932556,
      "kl": 0.526611328125,
      "learning_rate": 5.970378084704441e-05,
      "loss": 0.0471,
      "reward": 0.9492662008851767,
      "reward_std": 0.4657456800341606,
      "rewards/cosine_scaled_reward": 0.037133121863007545,
      "rewards/format_reward": 0.8750000149011612,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2996.041717529297,
      "epoch": 0.14971428571428572,
      "grad_norm": 0.1437804251909256,
      "kl": 0.3951416015625,
      "learning_rate": 5.9391230489161734e-05,
      "loss": 0.0074,
      "reward": 0.6824960559606552,
      "reward_std": 0.5514045283198357,
      "rewards/cosine_scaled_reward": -0.0962519608438015,
      "rewards/format_reward": 0.8750000149011612,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3299.916748046875,
      "epoch": 0.15028571428571427,
      "grad_norm": 0.860514760017395,
      "kl": 0.480712890625,
      "learning_rate": 5.907846610890012e-05,
      "loss": 0.0247,
      "reward": 0.5251449644565582,
      "reward_std": 0.5569327585399151,
      "rewards/cosine_scaled_reward": -0.17492752522230148,
      "rewards/format_reward": 0.8750000149011612,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3207.291748046875,
      "epoch": 0.15085714285714286,
      "grad_norm": 0.3078582286834717,
      "kl": 0.57763671875,
      "learning_rate": 5.876550294995421e-05,
      "loss": 0.0128,
      "reward": 0.6213351637125015,
      "reward_std": 0.8778404965996742,
      "rewards/cosine_scaled_reward": -0.04349910840392113,
      "rewards/format_reward": 0.7083333507180214,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2830.3333740234375,
      "epoch": 0.15142857142857144,
      "grad_norm": 0.17845019698143005,
      "kl": 0.364501953125,
      "learning_rate": 5.8452356265706845e-05,
      "loss": 0.0782,
      "reward": 0.6127370540052652,
      "reward_std": 0.8011298030614853,
      "rewards/cosine_scaled_reward": -0.068631486967206,
      "rewards/format_reward": 0.7500000149011612,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2766.5416870117188,
      "epoch": 0.152,
      "grad_norm": 0.31444934010505676,
      "kl": 0.2955322265625,
      "learning_rate": 5.813904131848564e-05,
      "loss": -0.0466,
      "reward": 0.3219255795702338,
      "reward_std": 0.44407689198851585,
      "rewards/cosine_scaled_reward": -0.2765372171998024,
      "rewards/format_reward": 0.8750000149011612,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1446.833396911621,
      "epoch": 0.15257142857142858,
      "grad_norm": 0.6262673139572144,
      "kl": 0.2203369140625,
      "learning_rate": 5.782557337881911e-05,
      "loss": 0.293,
      "reward": 0.44304793514311314,
      "reward_std": 0.6326870061457157,
      "rewards/cosine_scaled_reward": -0.1743093803524971,
      "rewards/format_reward": 0.7916666716337204,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2241.50008392334,
      "epoch": 0.15314285714285714,
      "grad_norm": 0.28819912672042847,
      "kl": 0.436279296875,
      "learning_rate": 5.751196772469237e-05,
      "loss": 0.1259,
      "reward": 0.5492542944848537,
      "reward_std": 0.793809786438942,
      "rewards/cosine_scaled_reward": -0.017039529979228973,
      "rewards/format_reward": 0.5833333432674408,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3128.0000610351562,
      "epoch": 0.15371428571428572,
      "grad_norm": 0.19022883474826813,
      "kl": 0.46630859375,
      "learning_rate": 5.719823964080261e-05,
      "loss": 0.0451,
      "reward": 0.4434543699026108,
      "reward_std": 0.4890909567475319,
      "rewards/cosine_scaled_reward": -0.17410616483539343,
      "rewards/format_reward": 0.7916667014360428,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2065.166778564453,
      "epoch": 0.15428571428571428,
      "grad_norm": 0.340876966714859,
      "kl": 0.368896484375,
      "learning_rate": 5.688440441781399e-05,
      "loss": -0.0992,
      "reward": 1.2790502207353711,
      "reward_std": 0.512369230389595,
      "rewards/cosine_scaled_reward": 0.18119176104664803,
      "rewards/format_reward": 0.9166666865348816,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2220.791717529297,
      "epoch": 0.15485714285714286,
      "grad_norm": 0.4013032019138336,
      "kl": 0.3543701171875,
      "learning_rate": 5.657047735161256e-05,
      "loss": -0.0809,
      "reward": 0.34419402945786715,
      "reward_std": 0.34300512447953224,
      "rewards/cosine_scaled_reward": -0.20290300622582436,
      "rewards/format_reward": 0.7500000074505806,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2219.750030517578,
      "epoch": 0.15542857142857142,
      "grad_norm": 0.35917147994041443,
      "kl": 0.238037109375,
      "learning_rate": 5.6256473742560614e-05,
      "loss": 0.1033,
      "reward": 0.7890710458159447,
      "reward_std": 0.7226014733314514,
      "rewards/cosine_scaled_reward": -0.06379782781004906,
      "rewards/format_reward": 0.9166666865348816,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2267.3334045410156,
      "epoch": 0.156,
      "grad_norm": 0.24575629830360413,
      "kl": 0.382080078125,
      "learning_rate": 5.594240889475107e-05,
      "loss": 0.0056,
      "reward": 0.7830603891052306,
      "reward_std": 0.9113038927316666,
      "rewards/cosine_scaled_reward": -0.025136479176580906,
      "rewards/format_reward": 0.8333333432674408,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3100.916748046875,
      "epoch": 0.15657142857142858,
      "grad_norm": 0.1788811832666397,
      "kl": 0.3974609375,
      "learning_rate": 5.5628298115261545e-05,
      "loss": 0.0351,
      "reward": 0.46365073323249817,
      "reward_std": 0.6709984391927719,
      "rewards/cosine_scaled_reward": -0.12234130874276161,
      "rewards/format_reward": 0.708333358168602,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2573.666778564453,
      "epoch": 0.15714285714285714,
      "grad_norm": 0.2195524126291275,
      "kl": 0.45074462890625,
      "learning_rate": 5.5314156713408275e-05,
      "loss": 0.0242,
      "reward": 0.33101664669811726,
      "reward_std": 0.6106998361647129,
      "rewards/cosine_scaled_reward": -0.18865835666656494,
      "rewards/format_reward": 0.7083333507180214,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1952.3750457763672,
      "epoch": 0.15771428571428572,
      "grad_norm": 0.4302261471748352,
      "kl": 0.19842529296875,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.1747,
      "reward": 1.8006355911493301,
      "reward_std": 0.5728400154039264,
      "rewards/cosine_scaled_reward": 0.42115106899291277,
      "rewards/format_reward": 0.9583333432674408,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2859.4584350585938,
      "epoch": 0.15828571428571428,
      "grad_norm": 0.27882683277130127,
      "kl": 0.39794921875,
      "learning_rate": 5.468584328659173e-05,
      "loss": 0.0633,
      "reward": 0.9866037368774414,
      "reward_std": 0.9637529253959656,
      "rewards/cosine_scaled_reward": 0.09746852144598961,
      "rewards/format_reward": 0.7916666865348816,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2202.75,
      "epoch": 0.15885714285714286,
      "grad_norm": 0.16858187317848206,
      "kl": 0.26458740234375,
      "learning_rate": 5.4371701884738466e-05,
      "loss": 0.1202,
      "reward": 0.39508337527513504,
      "reward_std": 0.7221878357231617,
      "rewards/cosine_scaled_reward": -0.21912500262260437,
      "rewards/format_reward": 0.8333333432674408,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2615.0000610351562,
      "epoch": 0.15942857142857142,
      "grad_norm": 0.6125884056091309,
      "kl": 0.30419921875,
      "learning_rate": 5.405759110524894e-05,
      "loss": 0.1486,
      "reward": 0.9705154225230217,
      "reward_std": 0.7704289853572845,
      "rewards/cosine_scaled_reward": 0.047757700085639954,
      "rewards/format_reward": 0.8750000298023224,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2305.1666870117188,
      "epoch": 0.16,
      "grad_norm": 0.25846433639526367,
      "kl": 0.323974609375,
      "learning_rate": 5.374352625743941e-05,
      "loss": -0.0256,
      "reward": 0.5761691424995661,
      "reward_std": 0.4631784576922655,
      "rewards/cosine_scaled_reward": -0.06608210876584053,
      "rewards/format_reward": 0.7083333358168602,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2263.4584350585938,
      "epoch": 0.16057142857142856,
      "grad_norm": 0.6869896650314331,
      "kl": 0.30859375,
      "learning_rate": 5.342952264838747e-05,
      "loss": -0.1307,
      "reward": 1.1992193013429642,
      "reward_std": 0.8062855824828148,
      "rewards/cosine_scaled_reward": 0.16210963344201446,
      "rewards/format_reward": 0.8750000149011612,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2695.5416870117188,
      "epoch": 0.16114285714285714,
      "grad_norm": 0.2958724796772003,
      "kl": 0.48028564453125,
      "learning_rate": 5.311559558218603e-05,
      "loss": 0.0435,
      "reward": 0.9084782637655735,
      "reward_std": 0.9329799860715866,
      "rewards/cosine_scaled_reward": 0.10007247515022755,
      "rewards/format_reward": 0.7083333507180214,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2283.6250610351562,
      "epoch": 0.16171428571428573,
      "grad_norm": 0.6330475807189941,
      "kl": 0.239501953125,
      "learning_rate": 5.28017603591974e-05,
      "loss": -0.1108,
      "reward": 0.7594865150749683,
      "reward_std": 0.6004514396190643,
      "rewards/cosine_scaled_reward": -0.09942345693707466,
      "rewards/format_reward": 0.9583333432674408,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2050.458396911621,
      "epoch": 0.16228571428571428,
      "grad_norm": 0.22488415241241455,
      "kl": 0.25628662109375,
      "learning_rate": 5.248803227530763e-05,
      "loss": 0.019,
      "reward": 2.079008385539055,
      "reward_std": 0.725292238406837,
      "rewards/cosine_scaled_reward": 0.6020041219890118,
      "rewards/format_reward": 0.875,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3271.541748046875,
      "epoch": 0.16285714285714287,
      "grad_norm": 0.2648293673992157,
      "kl": 0.39990234375,
      "learning_rate": 5.2174426621180906e-05,
      "loss": 0.0162,
      "reward": 0.347307525575161,
      "reward_std": 0.5538838356733322,
      "rewards/cosine_scaled_reward": -0.2430129125714302,
      "rewards/format_reward": 0.8333333432674408,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2571.7501220703125,
      "epoch": 0.16342857142857142,
      "grad_norm": 0.31099557876586914,
      "kl": 0.217041015625,
      "learning_rate": 5.186095868151436e-05,
      "loss": -0.0607,
      "reward": 1.441674392670393,
      "reward_std": 0.8816814571619034,
      "rewards/cosine_scaled_reward": 0.26250384002923965,
      "rewards/format_reward": 0.9166666716337204,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2525.791748046875,
      "epoch": 0.164,
      "grad_norm": 0.21399115025997162,
      "kl": 0.212890625,
      "learning_rate": 5.154764373429316e-05,
      "loss": 0.0111,
      "reward": 1.264048159122467,
      "reward_std": 1.0165133327245712,
      "rewards/cosine_scaled_reward": 0.17369072884321213,
      "rewards/format_reward": 0.9166666865348816,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3136.8750610351562,
      "epoch": 0.16457142857142856,
      "grad_norm": 0.2190590798854828,
      "kl": 0.3248291015625,
      "learning_rate": 5.1234497050045814e-05,
      "loss": -0.0187,
      "reward": 0.4037191644310951,
      "reward_std": 0.7680270224809647,
      "rewards/cosine_scaled_reward": -0.19397378712892532,
      "rewards/format_reward": 0.7916666865348816,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3075.8333740234375,
      "epoch": 0.16514285714285715,
      "grad_norm": 0.18252302706241608,
      "kl": 0.40673828125,
      "learning_rate": 5.0921533891099905e-05,
      "loss": 0.0377,
      "reward": 0.6799687976017594,
      "reward_std": 0.6346831023693085,
      "rewards/cosine_scaled_reward": -0.07668228447437286,
      "rewards/format_reward": 0.8333333432674408,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2687.8334350585938,
      "epoch": 0.1657142857142857,
      "grad_norm": 0.272157222032547,
      "kl": 0.2164306640625,
      "learning_rate": 5.0608769510838284e-05,
      "loss": -0.0162,
      "reward": 0.8302770014852285,
      "reward_std": 0.9631283730268478,
      "rewards/cosine_scaled_reward": -0.001528160646557808,
      "rewards/format_reward": 0.8333333432674408,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3265.8751220703125,
      "epoch": 0.1662857142857143,
      "grad_norm": 0.23784101009368896,
      "kl": 0.3359375,
      "learning_rate": 5.0296219152955604e-05,
      "loss": 0.0305,
      "reward": 0.386111356317997,
      "reward_std": 0.39381321892142296,
      "rewards/cosine_scaled_reward": -0.20277767814695835,
      "rewards/format_reward": 0.7916666865348816,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2972.5416870117188,
      "epoch": 0.16685714285714287,
      "grad_norm": 0.3053220510482788,
      "kl": 0.323974609375,
      "learning_rate": 4.998389805071536e-05,
      "loss": -0.0195,
      "reward": 0.5026585329324007,
      "reward_std": 0.9295567944645882,
      "rewards/cosine_scaled_reward": -0.12367073073983192,
      "rewards/format_reward": 0.7500000074505806,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2035.3333740234375,
      "epoch": 0.16742857142857143,
      "grad_norm": 0.15959811210632324,
      "kl": 0.122650146484375,
      "learning_rate": 4.9671821426207455e-05,
      "loss": 0.0064,
      "reward": 1.4870387986302376,
      "reward_std": 0.46400389447808266,
      "rewards/cosine_scaled_reward": 0.3060194104909897,
      "rewards/format_reward": 0.875,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2581.958465576172,
      "epoch": 0.168,
      "grad_norm": 0.2600279450416565,
      "kl": 0.1378173828125,
      "learning_rate": 4.936000448960631e-05,
      "loss": 0.0812,
      "reward": 0.6963506219908595,
      "reward_std": 0.8576374873518944,
      "rewards/cosine_scaled_reward": -0.047658056020736694,
      "rewards/format_reward": 0.7916666716337204,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2707.750045776367,
      "epoch": 0.16857142857142857,
      "grad_norm": 0.3398168385028839,
      "kl": 0.26220703125,
      "learning_rate": 4.904846243842949e-05,
      "loss": 0.0073,
      "reward": 0.1822656556032598,
      "reward_std": 0.7573360428214073,
      "rewards/cosine_scaled_reward": -0.17970050126314163,
      "rewards/format_reward": 0.5416666716337204,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2680.1251220703125,
      "epoch": 0.16914285714285715,
      "grad_norm": 0.29461199045181274,
      "kl": 0.2176513671875,
      "learning_rate": 4.873721045679707e-05,
      "loss": 0.0926,
      "reward": 0.17297326400876045,
      "reward_std": 0.5826424770057201,
      "rewards/cosine_scaled_reward": -0.26768005080521107,
      "rewards/format_reward": 0.7083333507180214,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2895.4584350585938,
      "epoch": 0.1697142857142857,
      "grad_norm": 0.14295834302902222,
      "kl": 0.1163330078125,
      "learning_rate": 4.842626371469149e-05,
      "loss": -0.0717,
      "reward": 1.922136515378952,
      "reward_std": 0.5652984231710434,
      "rewards/cosine_scaled_reward": 0.4819015748798847,
      "rewards/format_reward": 0.9583333432674408,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2346.041748046875,
      "epoch": 0.1702857142857143,
      "grad_norm": 0.172696053981781,
      "kl": 0.2733154296875,
      "learning_rate": 4.811563736721829e-05,
      "loss": 0.046,
      "reward": 0.902316652238369,
      "reward_std": 0.9037366360425949,
      "rewards/cosine_scaled_reward": 0.034491658210754395,
      "rewards/format_reward": 0.8333333432674408,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2209.250045776367,
      "epoch": 0.17085714285714285,
      "grad_norm": 0.44964268803596497,
      "kl": 0.1575927734375,
      "learning_rate": 4.780534655386744e-05,
      "loss": -0.0211,
      "reward": 0.48153945803642273,
      "reward_std": 0.6702739596366882,
      "rewards/cosine_scaled_reward": -0.15506362076848745,
      "rewards/format_reward": 0.791666679084301,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2662.041748046875,
      "epoch": 0.17142857142857143,
      "grad_norm": 0.29695504903793335,
      "kl": 0.177978515625,
      "learning_rate": 4.74954063977754e-05,
      "loss": 0.0111,
      "reward": 0.5166540406644344,
      "reward_std": 1.0730202198028564,
      "rewards/cosine_scaled_reward": -0.11667298898100853,
      "rewards/format_reward": 0.7500000149011612,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2761.3334350585938,
      "epoch": 0.172,
      "grad_norm": 0.21826083958148956,
      "kl": 0.321380615234375,
      "learning_rate": 4.718583200498814e-05,
      "loss": 0.0618,
      "reward": 0.18236689269542694,
      "reward_std": 0.7055843695998192,
      "rewards/cosine_scaled_reward": -0.2629832345992327,
      "rewards/format_reward": 0.708333358168602,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2820.3750610351562,
      "epoch": 0.17257142857142857,
      "grad_norm": 0.1797807216644287,
      "kl": 0.25360107421875,
      "learning_rate": 4.687663846372481e-05,
      "loss": -0.0163,
      "reward": 0.9326446410268545,
      "reward_std": 0.8454302102327347,
      "rewards/cosine_scaled_reward": 0.09132230095565319,
      "rewards/format_reward": 0.7500000149011612,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2805.791748046875,
      "epoch": 0.17314285714285715,
      "grad_norm": 0.20349080860614777,
      "kl": 0.25506591796875,
      "learning_rate": 4.6567840843642384e-05,
      "loss": 0.0379,
      "reward": 0.5512912534177303,
      "reward_std": 0.5454032495617867,
      "rewards/cosine_scaled_reward": -0.14102105796337128,
      "rewards/format_reward": 0.8333333432674408,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2817.4584350585938,
      "epoch": 0.1737142857142857,
      "grad_norm": 0.37047722935676575,
      "kl": 0.28692626953125,
      "learning_rate": 4.6259454195101274e-05,
      "loss": -0.1089,
      "reward": 0.6676226779818535,
      "reward_std": 0.9479784294962883,
      "rewards/cosine_scaled_reward": -0.04118867497891188,
      "rewards/format_reward": 0.7500000223517418,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2684.416748046875,
      "epoch": 0.1742857142857143,
      "grad_norm": 0.166043221950531,
      "kl": 0.2327880859375,
      "learning_rate": 4.5951493548431603e-05,
      "loss": 0.0119,
      "reward": 1.0854606181383133,
      "reward_std": 0.4668488036841154,
      "rewards/cosine_scaled_reward": 0.08439697325229645,
      "rewards/format_reward": 0.9166666865348816,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1833.25,
      "epoch": 0.17485714285714285,
      "grad_norm": 0.1900881677865982,
      "kl": 0.11285400390625,
      "learning_rate": 4.564397391320084e-05,
      "loss": 0.0551,
      "reward": 1.6040659546852112,
      "reward_std": 0.7503243815153837,
      "rewards/cosine_scaled_reward": 0.3645329400897026,
      "rewards/format_reward": 0.875,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3245.9583740234375,
      "epoch": 0.17542857142857143,
      "grad_norm": 0.39895114302635193,
      "kl": 0.1572265625,
      "learning_rate": 4.5336910277482156e-05,
      "loss": 0.0809,
      "reward": 0.28692900389432907,
      "reward_std": 0.8957325369119644,
      "rewards/cosine_scaled_reward": -0.14820216968655586,
      "rewards/format_reward": 0.5833333469927311,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2018.666748046875,
      "epoch": 0.176,
      "grad_norm": 0.32068708539009094,
      "kl": 0.10467529296875,
      "learning_rate": 4.503031760712397e-05,
      "loss": 0.077,
      "reward": 1.5325582474470139,
      "reward_std": 0.9100236482918262,
      "rewards/cosine_scaled_reward": 0.34961244463920593,
      "rewards/format_reward": 0.8333333358168602,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3323.8333740234375,
      "epoch": 0.17657142857142857,
      "grad_norm": 0.21846871078014374,
      "kl": 0.2919921875,
      "learning_rate": 4.47242108450205e-05,
      "loss": 0.0366,
      "reward": -0.005359284579753876,
      "reward_std": 0.415067620575428,
      "rewards/cosine_scaled_reward": -0.25267963111400604,
      "rewards/format_reward": 0.5000000223517418,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3336.8333740234375,
      "epoch": 0.17714285714285713,
      "grad_norm": 0.27864164113998413,
      "kl": 0.333984375,
      "learning_rate": 4.4418604910383456e-05,
      "loss": 0.0695,
      "reward": 0.5436939476057887,
      "reward_std": 0.7889657467603683,
      "rewards/cosine_scaled_reward": -0.1656530387699604,
      "rewards/format_reward": 0.8750000149011612,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3064.5000610351562,
      "epoch": 0.1777142857142857,
      "grad_norm": 0.14427867531776428,
      "kl": 0.224365234375,
      "learning_rate": 4.411351469801496e-05,
      "loss": 0.0368,
      "reward": 0.4646228328347206,
      "reward_std": 0.5353215932846069,
      "rewards/cosine_scaled_reward": -0.1010219119489193,
      "rewards/format_reward": 0.6666666679084301,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3088.2500610351562,
      "epoch": 0.1782857142857143,
      "grad_norm": 0.20969220995903015,
      "kl": 0.238525390625,
      "learning_rate": 4.380895507758155e-05,
      "loss": 0.0171,
      "reward": 0.33924252539873123,
      "reward_std": 0.3376295939087868,
      "rewards/cosine_scaled_reward": -0.2678787410259247,
      "rewards/format_reward": 0.8750000149011612,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2415.5000228881836,
      "epoch": 0.17885714285714285,
      "grad_norm": 1.5862038135528564,
      "kl": 0.308349609375,
      "learning_rate": 4.3504940892889434e-05,
      "loss": 0.0216,
      "reward": 0.808872826397419,
      "reward_std": 0.7256791153922677,
      "rewards/cosine_scaled_reward": -0.012230251293658512,
      "rewards/format_reward": 0.8333333358168602,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2261.12508392334,
      "epoch": 0.17942857142857144,
      "grad_norm": 0.28129690885543823,
      "kl": 0.21649169921875,
      "learning_rate": 4.3201486961161094e-05,
      "loss": 0.1205,
      "reward": 1.4821401089429855,
      "reward_std": 0.9167400598526001,
      "rewards/cosine_scaled_reward": 0.26190340146422386,
      "rewards/format_reward": 0.9583333432674408,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2525.500045776367,
      "epoch": 0.18,
      "grad_norm": 0.17051948606967926,
      "kl": 0.2293701171875,
      "learning_rate": 4.289860807231305e-05,
      "loss": 0.09,
      "reward": 0.16041448712348938,
      "reward_std": 0.3049400746822357,
      "rewards/cosine_scaled_reward": -0.294792752712965,
      "rewards/format_reward": 0.7500000074505806,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3395.791748046875,
      "epoch": 0.18057142857142858,
      "grad_norm": 0.17187951505184174,
      "kl": 0.312255859375,
      "learning_rate": 4.259631898823504e-05,
      "loss": 0.0457,
      "reward": 0.3425696883350611,
      "reward_std": 0.9086843878030777,
      "rewards/cosine_scaled_reward": -0.18288182839751244,
      "rewards/format_reward": 0.708333358168602,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2967.2916870117188,
      "epoch": 0.18114285714285713,
      "grad_norm": 0.6705901622772217,
      "kl": 0.21875,
      "learning_rate": 4.229463444207056e-05,
      "loss": 0.0771,
      "reward": 0.533269502222538,
      "reward_std": 0.7537828385829926,
      "rewards/cosine_scaled_reward": -0.08753193635493517,
      "rewards/format_reward": 0.7083333507180214,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2270.041748046875,
      "epoch": 0.18171428571428572,
      "grad_norm": 0.26364800333976746,
      "kl": 0.19378662109375,
      "learning_rate": 4.1993569137498776e-05,
      "loss": 0.0659,
      "reward": 1.1275322251021862,
      "reward_std": 0.8576941788196564,
      "rewards/cosine_scaled_reward": 0.1262660927604884,
      "rewards/format_reward": 0.8750000149011612,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2608.541748046875,
      "epoch": 0.18228571428571427,
      "grad_norm": 0.3073921203613281,
      "kl": 0.302001953125,
      "learning_rate": 4.1693137748017916e-05,
      "loss": 0.0071,
      "reward": 0.6904770843684673,
      "reward_std": 0.8109993487596512,
      "rewards/cosine_scaled_reward": -0.09226147457957268,
      "rewards/format_reward": 0.8750000149011612,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2851.1666870117188,
      "epoch": 0.18285714285714286,
      "grad_norm": 0.21277567744255066,
      "kl": 0.29742431640625,
      "learning_rate": 4.1393354916230006e-05,
      "loss": 0.0273,
      "reward": 0.687724407762289,
      "reward_std": 1.0126019269227982,
      "rewards/cosine_scaled_reward": 0.031362203881144524,
      "rewards/format_reward": 0.6250000074505806,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3186.9583740234375,
      "epoch": 0.18342857142857144,
      "grad_norm": 0.26291391253471375,
      "kl": 0.435791015625,
      "learning_rate": 4.109423525312738e-05,
      "loss": 0.0092,
      "reward": 0.3795064650475979,
      "reward_std": 0.5648243799805641,
      "rewards/cosine_scaled_reward": -0.2477467879652977,
      "rewards/format_reward": 0.8750000149011612,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3135.6668090820312,
      "epoch": 0.184,
      "grad_norm": 0.16939568519592285,
      "kl": 0.41015625,
      "learning_rate": 4.079579333738039e-05,
      "loss": 0.0569,
      "reward": 0.5801227353513241,
      "reward_std": 0.6866341158747673,
      "rewards/cosine_scaled_reward": -0.06410535424947739,
      "rewards/format_reward": 0.7083333656191826,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2883.9584350585938,
      "epoch": 0.18457142857142858,
      "grad_norm": 0.3012012243270874,
      "kl": 0.3544921875,
      "learning_rate": 4.049804371462701e-05,
      "loss": 0.0825,
      "reward": 0.8750679045915604,
      "reward_std": 0.5257211327552795,
      "rewards/cosine_scaled_reward": 0.020867276936769485,
      "rewards/format_reward": 0.8333333432674408,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2720.7083435058594,
      "epoch": 0.18514285714285714,
      "grad_norm": 0.16761939227581024,
      "kl": 0.320556640625,
      "learning_rate": 4.0201000896763766e-05,
      "loss": 0.0441,
      "reward": 0.7150210291147232,
      "reward_std": 0.9162970930337906,
      "rewards/cosine_scaled_reward": -0.05915616638958454,
      "rewards/format_reward": 0.8333333432674408,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2600.541748046875,
      "epoch": 0.18571428571428572,
      "grad_norm": 0.28921419382095337,
      "kl": 0.536865234375,
      "learning_rate": 3.9904679361238525e-05,
      "loss": 0.0768,
      "reward": 0.5437839552760124,
      "reward_std": 0.8164877891540527,
      "rewards/cosine_scaled_reward": -0.04060804285109043,
      "rewards/format_reward": 0.625,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2133.5000762939453,
      "epoch": 0.18628571428571428,
      "grad_norm": 0.28011372685432434,
      "kl": 0.3050537109375,
      "learning_rate": 3.960909355034491e-05,
      "loss": -0.0025,
      "reward": 0.789381206035614,
      "reward_std": 0.8102166727185249,
      "rewards/cosine_scaled_reward": -0.021976079791784286,
      "rewards/format_reward": 0.8333333358168602,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2548.2083740234375,
      "epoch": 0.18685714285714286,
      "grad_norm": 0.44690772891044617,
      "kl": 0.39697265625,
      "learning_rate": 3.9314257870518325e-05,
      "loss": 0.2676,
      "reward": 0.8154665417969227,
      "reward_std": 0.568376112729311,
      "rewards/cosine_scaled_reward": -0.00893339142203331,
      "rewards/format_reward": 0.833333358168602,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2082.916748046875,
      "epoch": 0.18742857142857142,
      "grad_norm": 0.39973941445350647,
      "kl": 0.22576904296875,
      "learning_rate": 3.902018669163384e-05,
      "loss": -0.0253,
      "reward": 0.7669844925403595,
      "reward_std": 0.7145446315407753,
      "rewards/cosine_scaled_reward": -0.0956744309514761,
      "rewards/format_reward": 0.9583333432674408,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2922.6666870117188,
      "epoch": 0.188,
      "grad_norm": 0.33824387192726135,
      "kl": 0.66064453125,
      "learning_rate": 3.872689434630585e-05,
      "loss": 0.1195,
      "reward": 0.8472144799306989,
      "reward_std": 0.8882746696472168,
      "rewards/cosine_scaled_reward": 0.1111072227358818,
      "rewards/format_reward": 0.6250000260770321,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2951.3333740234375,
      "epoch": 0.18857142857142858,
      "grad_norm": 0.22268566489219666,
      "kl": 0.62255859375,
      "learning_rate": 3.843439512918949e-05,
      "loss": 0.1063,
      "reward": 0.6393527542240918,
      "reward_std": 0.9115183800458908,
      "rewards/cosine_scaled_reward": 0.028009682893753052,
      "rewards/format_reward": 0.5833333507180214,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1935.0000762939453,
      "epoch": 0.18914285714285714,
      "grad_norm": 0.4056748151779175,
      "kl": 0.3223876953125,
      "learning_rate": 3.814270329628396e-05,
      "loss": -0.0437,
      "reward": 0.987700991332531,
      "reward_std": 0.7533665373921394,
      "rewards/cosine_scaled_reward": 0.07718382868915796,
      "rewards/format_reward": 0.8333333432674408,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2499.125030517578,
      "epoch": 0.18971428571428572,
      "grad_norm": 0.40982890129089355,
      "kl": 0.327392578125,
      "learning_rate": 3.785183306423768e-05,
      "loss": -0.0806,
      "reward": 0.9079742878675461,
      "reward_std": 0.9995783120393753,
      "rewards/cosine_scaled_reward": -0.025179538875818253,
      "rewards/format_reward": 0.9583333432674408,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2600.2084197998047,
      "epoch": 0.19028571428571428,
      "grad_norm": 0.22521264851093292,
      "kl": 0.434326171875,
      "learning_rate": 3.756179860965538e-05,
      "loss": 0.1256,
      "reward": 0.21467669680714607,
      "reward_std": 0.7439640909433365,
      "rewards/cosine_scaled_reward": -0.2259949930012226,
      "rewards/format_reward": 0.666666679084301,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3179.3333740234375,
      "epoch": 0.19085714285714286,
      "grad_norm": 0.3684881627559662,
      "kl": 0.6171875,
      "learning_rate": 3.7272614068407205e-05,
      "loss": 0.044,
      "reward": 0.2664009025320411,
      "reward_std": 0.7427940741181374,
      "rewards/cosine_scaled_reward": -0.15846621617674828,
      "rewards/format_reward": 0.5833333395421505,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1974.791732788086,
      "epoch": 0.19142857142857142,
      "grad_norm": 0.19373755156993866,
      "kl": 0.152557373046875,
      "learning_rate": 3.698429353493974e-05,
      "loss": 0.0285,
      "reward": 0.8958619683980942,
      "reward_std": 0.7756932191550732,
      "rewards/cosine_scaled_reward": -0.052069032564759254,
      "rewards/format_reward": 1.0,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2503.416732788086,
      "epoch": 0.192,
      "grad_norm": 0.4300239682197571,
      "kl": 0.178192138671875,
      "learning_rate": 3.6696851061589e-05,
      "loss": 0.0114,
      "reward": 1.0004391744732857,
      "reward_std": 0.7523290365934372,
      "rewards/cosine_scaled_reward": 0.08355289697647095,
      "rewards/format_reward": 0.8333333432674408,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2300.5833587646484,
      "epoch": 0.19257142857142856,
      "grad_norm": 0.21802906692028046,
      "kl": 0.3260498046875,
      "learning_rate": 3.6410300657895626e-05,
      "loss": 0.1048,
      "reward": 1.2021107599139214,
      "reward_std": 0.6737323552370071,
      "rewards/cosine_scaled_reward": 0.18438871018588543,
      "rewards/format_reward": 0.833333358168602,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2349.083396911621,
      "epoch": 0.19314285714285714,
      "grad_norm": 0.23494723439216614,
      "kl": 0.295654296875,
      "learning_rate": 3.6124656289922034e-05,
      "loss": 0.0219,
      "reward": 1.5479250699281693,
      "reward_std": 1.1081312000751495,
      "rewards/cosine_scaled_reward": 0.3364625433459878,
      "rewards/format_reward": 0.8750000149011612,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3243.4584350585938,
      "epoch": 0.19371428571428573,
      "grad_norm": 0.22037623822689056,
      "kl": 0.5400390625,
      "learning_rate": 3.583993187957173e-05,
      "loss": 0.0615,
      "reward": 0.39422329515218735,
      "reward_std": 0.7927843853831291,
      "rewards/cosine_scaled_reward": -0.13622168637812138,
      "rewards/format_reward": 0.666666679084301,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2087.750030517578,
      "epoch": 0.19428571428571428,
      "grad_norm": 0.2113656848669052,
      "kl": 0.368560791015625,
      "learning_rate": 3.5556141303910795e-05,
      "loss": 0.1216,
      "reward": 0.583352442830801,
      "reward_std": 1.0455666035413742,
      "rewards/cosine_scaled_reward": -0.062490461859852076,
      "rewards/format_reward": 0.7083333358168602,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1610.5416717529297,
      "epoch": 0.19485714285714287,
      "grad_norm": 0.13782458007335663,
      "kl": 0.204315185546875,
      "learning_rate": 3.5273298394491515e-05,
      "loss": 0.0774,
      "reward": 0.5869630854576826,
      "reward_std": 0.42929424345493317,
      "rewards/cosine_scaled_reward": -0.1648518219590187,
      "rewards/format_reward": 0.9166666716337204,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2041.5417175292969,
      "epoch": 0.19542857142857142,
      "grad_norm": 0.2814924716949463,
      "kl": 0.20819091796875,
      "learning_rate": 3.499141693667828e-05,
      "loss": 0.1013,
      "reward": 0.8442784734070301,
      "reward_std": 0.7920543104410172,
      "rewards/cosine_scaled_reward": 0.026305895298719406,
      "rewards/format_reward": 0.7916666716337204,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2508.666748046875,
      "epoch": 0.196,
      "grad_norm": 0.24862973392009735,
      "kl": 0.31884765625,
      "learning_rate": 3.4710510668975624e-05,
      "loss": 0.0598,
      "reward": 0.3385371249169111,
      "reward_std": 0.6598921939730644,
      "rewards/cosine_scaled_reward": -0.2473981073126197,
      "rewards/format_reward": 0.8333333432674408,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2231.833396911621,
      "epoch": 0.19657142857142856,
      "grad_norm": 0.22046416997909546,
      "kl": 0.376220703125,
      "learning_rate": 3.443059328235878e-05,
      "loss": -0.0093,
      "reward": 1.2481789495795965,
      "reward_std": 0.9312818646430969,
      "rewards/cosine_scaled_reward": 0.18658944219350815,
      "rewards/format_reward": 0.875,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1617.5417022705078,
      "epoch": 0.19714285714285715,
      "grad_norm": 0.3377382159233093,
      "kl": 0.17791748046875,
      "learning_rate": 3.415167841960624e-05,
      "loss": 0.1082,
      "reward": 0.9055377095937729,
      "reward_std": 0.7036742344498634,
      "rewards/cosine_scaled_reward": 0.015268810093402863,
      "rewards/format_reward": 0.8750000149011612,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3182.4584350585938,
      "epoch": 0.1977142857142857,
      "grad_norm": 0.24967238306999207,
      "kl": 0.5107421875,
      "learning_rate": 3.387377967463493e-05,
      "loss": 0.0664,
      "reward": 1.011141985654831,
      "reward_std": 0.9729516059160233,
      "rewards/cosine_scaled_reward": 0.10973765794187784,
      "rewards/format_reward": 0.7916666865348816,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2438.166748046875,
      "epoch": 0.1982857142857143,
      "grad_norm": 0.35565564036369324,
      "kl": 0.45068359375,
      "learning_rate": 3.359691059183761e-05,
      "loss": -0.0418,
      "reward": 0.5317260958254337,
      "reward_std": 0.744279682636261,
      "rewards/cosine_scaled_reward": -0.08830362930893898,
      "rewards/format_reward": 0.7083333432674408,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2034.1250915527344,
      "epoch": 0.19885714285714284,
      "grad_norm": 0.5289275050163269,
      "kl": 0.27899169921875,
      "learning_rate": 3.3321084665422807e-05,
      "loss": -0.0943,
      "reward": 0.5254748985171318,
      "reward_std": 0.6338090598583221,
      "rewards/cosine_scaled_reward": -0.17476258054375648,
      "rewards/format_reward": 0.8750000149011612,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3002.916748046875,
      "epoch": 0.19942857142857143,
      "grad_norm": 0.23819538950920105,
      "kl": 0.419189453125,
      "learning_rate": 3.304631533875703e-05,
      "loss": 0.0293,
      "reward": 0.3799813613295555,
      "reward_std": 0.8570699989795685,
      "rewards/cosine_scaled_reward": -0.16417600587010384,
      "rewards/format_reward": 0.708333358168602,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1831.2500305175781,
      "epoch": 0.2,
      "grad_norm": 0.12972407042980194,
      "kl": 0.311248779296875,
      "learning_rate": 3.2772616003709614e-05,
      "loss": 0.0679,
      "reward": 1.1549129895865917,
      "reward_std": 0.5767290014773607,
      "rewards/cosine_scaled_reward": 0.22328981384634972,
      "rewards/format_reward": 0.7083333358168602,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2462.041717529297,
      "epoch": 0.20057142857142857,
      "grad_norm": 0.3680292069911957,
      "kl": 0.3271484375,
      "learning_rate": 3.250000000000001e-05,
      "loss": -0.0844,
      "reward": 0.6425135992467403,
      "reward_std": 0.7020438965409994,
      "rewards/cosine_scaled_reward": -0.13707654364407063,
      "rewards/format_reward": 0.9166666716337204,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2185.2084197998047,
      "epoch": 0.20114285714285715,
      "grad_norm": 0.2795995771884918,
      "kl": 0.379791259765625,
      "learning_rate": 3.222848061454764e-05,
      "loss": 0.1466,
      "reward": 1.2067798674106598,
      "reward_std": 0.8647864162921906,
      "rewards/cosine_scaled_reward": 0.22838991414755583,
      "rewards/format_reward": 0.7500000149011612,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2554.1250610351562,
      "epoch": 0.2017142857142857,
      "grad_norm": 0.22771836817264557,
      "kl": 0.226806640625,
      "learning_rate": 3.195807108082429e-05,
      "loss": 0.0443,
      "reward": 0.6497006267309189,
      "reward_std": 0.7212181687355042,
      "rewards/cosine_scaled_reward": -0.1334830243140459,
      "rewards/format_reward": 0.9166666865348816,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2555.833465576172,
      "epoch": 0.2022857142857143,
      "grad_norm": 0.27769502997398376,
      "kl": 0.439697265625,
      "learning_rate": 3.168878457820915e-05,
      "loss": 0.0557,
      "reward": 0.36892162170261145,
      "reward_std": 0.725491639226675,
      "rewards/cosine_scaled_reward": -0.16970586776733398,
      "rewards/format_reward": 0.708333358168602,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2070.666748046875,
      "epoch": 0.20285714285714285,
      "grad_norm": 0.17403961718082428,
      "kl": 0.232147216796875,
      "learning_rate": 3.1420634231346445e-05,
      "loss": 0.0253,
      "reward": 0.6142217069864273,
      "reward_std": 0.37743623182177544,
      "rewards/cosine_scaled_reward": -0.13038915395736694,
      "rewards/format_reward": 0.8750000149011612,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2257.250030517578,
      "epoch": 0.20342857142857143,
      "grad_norm": 0.3374931812286377,
      "kl": 0.2642822265625,
      "learning_rate": 3.1153633109505784e-05,
      "loss": -0.1064,
      "reward": 1.8636004030704498,
      "reward_std": 0.7428180351853371,
      "rewards/cosine_scaled_reward": 0.4526335150003433,
      "rewards/format_reward": 0.9583333432674408,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2539.291717529297,
      "epoch": 0.204,
      "grad_norm": 0.2252262383699417,
      "kl": 0.33685302734375,
      "learning_rate": 3.088779422594514e-05,
      "loss": 0.0651,
      "reward": 0.2009668005630374,
      "reward_std": 0.5703963618725538,
      "rewards/cosine_scaled_reward": -0.29534994065761566,
      "rewards/format_reward": 0.7916666865348816,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2775.916748046875,
      "epoch": 0.20457142857142857,
      "grad_norm": 0.17728424072265625,
      "kl": 0.303619384765625,
      "learning_rate": 3.062313053727671e-05,
      "loss": 0.0615,
      "reward": 1.3498004898428917,
      "reward_std": 0.699789387639612,
      "rewards/cosine_scaled_reward": 0.19573353230953217,
      "rewards/format_reward": 0.9583333432674408,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1551.7500457763672,
      "epoch": 0.20514285714285715,
      "grad_norm": 0.23342669010162354,
      "kl": 0.131744384765625,
      "learning_rate": 3.0359654942835248e-05,
      "loss": 0.1226,
      "reward": 0.8648534566164017,
      "reward_std": 0.41398559510707855,
      "rewards/cosine_scaled_reward": -0.06757331639528275,
      "rewards/format_reward": 1.0,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2911.0834350585938,
      "epoch": 0.2057142857142857,
      "grad_norm": 0.20153023302555084,
      "kl": 0.409423828125,
      "learning_rate": 3.0097380284049527e-05,
      "loss": 0.0591,
      "reward": 0.5484894886612892,
      "reward_std": 0.8311030864715576,
      "rewards/cosine_scaled_reward": -0.12158861197531223,
      "rewards/format_reward": 0.7916666716337204,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2764.166748046875,
      "epoch": 0.2062857142857143,
      "grad_norm": 0.2474966049194336,
      "kl": 0.27166748046875,
      "learning_rate": 2.98363193438164e-05,
      "loss": 0.0633,
      "reward": 1.1824394315481186,
      "reward_std": 0.5549486838281155,
      "rewards/cosine_scaled_reward": 0.13288633804768324,
      "rewards/format_reward": 0.9166666865348816,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2818.0416870117188,
      "epoch": 0.20685714285714285,
      "grad_norm": 0.27774032950401306,
      "kl": 0.40234375,
      "learning_rate": 2.9576484845877794e-05,
      "loss": 0.1074,
      "reward": 0.7827701717615128,
      "reward_std": 0.8135464563965797,
      "rewards/cosine_scaled_reward": 0.016385079594329,
      "rewards/format_reward": 0.7500000223517418,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1643.9583740234375,
      "epoch": 0.20742857142857143,
      "grad_norm": 0.21469183266162872,
      "kl": 0.1004638671875,
      "learning_rate": 2.931788945420058e-05,
      "loss": 0.0814,
      "reward": 1.152499184012413,
      "reward_std": 0.48752832412719727,
      "rewards/cosine_scaled_reward": 0.07624955475330353,
      "rewards/format_reward": 1.0,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2685.2501068115234,
      "epoch": 0.208,
      "grad_norm": 0.228445902466774,
      "kl": 0.30401611328125,
      "learning_rate": 2.906054577235931e-05,
      "loss": 0.0817,
      "reward": 1.0254673808813095,
      "reward_std": 0.894573763012886,
      "rewards/cosine_scaled_reward": 0.09606703370809555,
      "rewards/format_reward": 0.833333358168602,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2010.1666870117188,
      "epoch": 0.20857142857142857,
      "grad_norm": 0.2819450795650482,
      "kl": 0.1759033203125,
      "learning_rate": 2.880446634292199e-05,
      "loss": -0.0103,
      "reward": 0.446915403008461,
      "reward_std": 0.32444334402680397,
      "rewards/cosine_scaled_reward": -0.2765423096716404,
      "rewards/format_reward": 1.0,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3228.9168090820312,
      "epoch": 0.20914285714285713,
      "grad_norm": 0.27710995078086853,
      "kl": 0.38671875,
      "learning_rate": 2.854966364683872e-05,
      "loss": -0.0126,
      "reward": 0.35312827420420945,
      "reward_std": 0.4395810291171074,
      "rewards/cosine_scaled_reward": -0.17760252580046654,
      "rewards/format_reward": 0.7083333507180214,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2320.5001068115234,
      "epoch": 0.20971428571428571,
      "grad_norm": 0.20172470808029175,
      "kl": 0.324066162109375,
      "learning_rate": 2.829615010283344e-05,
      "loss": 0.0833,
      "reward": 1.231175735592842,
      "reward_std": 1.0247896611690521,
      "rewards/cosine_scaled_reward": 0.2614212017506361,
      "rewards/format_reward": 0.7083333358168602,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2057.875030517578,
      "epoch": 0.2102857142857143,
      "grad_norm": 0.3990497589111328,
      "kl": 0.275390625,
      "learning_rate": 2.8043938066798646e-05,
      "loss": 0.0937,
      "reward": 1.4740150086581707,
      "reward_std": 0.6022031959146261,
      "rewards/cosine_scaled_reward": 0.2995075099170208,
      "rewards/format_reward": 0.8750000149011612,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2476.750045776367,
      "epoch": 0.21085714285714285,
      "grad_norm": 0.22831466794013977,
      "kl": 0.36370849609375,
      "learning_rate": 2.7793039831193136e-05,
      "loss": -0.0094,
      "reward": 0.7754583209753036,
      "reward_std": 0.6475037336349487,
      "rewards/cosine_scaled_reward": -0.008104167878627777,
      "rewards/format_reward": 0.791666679084301,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2979.3334350585938,
      "epoch": 0.21142857142857144,
      "grad_norm": 0.2895065248012543,
      "kl": 0.324951171875,
      "learning_rate": 2.754346762444296e-05,
      "loss": 0.0505,
      "reward": 0.7149831403512508,
      "reward_std": 0.612546693533659,
      "rewards/cosine_scaled_reward": -0.08000845834612846,
      "rewards/format_reward": 0.8750000298023224,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1844.6666870117188,
      "epoch": 0.212,
      "grad_norm": 0.2284790277481079,
      "kl": 0.42205810546875,
      "learning_rate": 2.729523361034538e-05,
      "loss": 0.1355,
      "reward": 0.6728616314940155,
      "reward_std": 0.2811877764761448,
      "rewards/cosine_scaled_reward": 0.044764142483472824,
      "rewards/format_reward": 0.5833333358168602,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2974.4583740234375,
      "epoch": 0.21257142857142858,
      "grad_norm": 0.3309856951236725,
      "kl": 0.2061767578125,
      "learning_rate": 2.7048349887476037e-05,
      "loss": 0.0268,
      "reward": 0.7733481526374817,
      "reward_std": 0.7097474634647369,
      "rewards/cosine_scaled_reward": -0.009159276261925697,
      "rewards/format_reward": 0.7916666716337204,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2476.291748046875,
      "epoch": 0.21314285714285713,
      "grad_norm": 0.19564294815063477,
      "kl": 0.3516845703125,
      "learning_rate": 2.6802828488599297e-05,
      "loss": 0.0353,
      "reward": 1.6214977502822876,
      "reward_std": 0.9677339103072882,
      "rewards/cosine_scaled_reward": 0.3940822258591652,
      "rewards/format_reward": 0.8333333432674408,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3004.2083740234375,
      "epoch": 0.21371428571428572,
      "grad_norm": 0.2780701518058777,
      "kl": 0.34326171875,
      "learning_rate": 2.6558681380081713e-05,
      "loss": 0.0384,
      "reward": 0.8554385527968407,
      "reward_std": 0.8751993477344513,
      "rewards/cosine_scaled_reward": 0.03188592568039894,
      "rewards/format_reward": 0.7916667014360428,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2040.041748046875,
      "epoch": 0.21428571428571427,
      "grad_norm": 0.2669806480407715,
      "kl": 0.29327392578125,
      "learning_rate": 2.6315920461308964e-05,
      "loss": 0.039,
      "reward": 1.2006212025880814,
      "reward_std": 0.6928756944835186,
      "rewards/cosine_scaled_reward": 0.18364391289651394,
      "rewards/format_reward": 0.8333333432674408,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2124.416717529297,
      "epoch": 0.21485714285714286,
      "grad_norm": 0.7000332474708557,
      "kl": 0.2510986328125,
      "learning_rate": 2.6074557564105727e-05,
      "loss": -0.1579,
      "reward": 1.4331469386816025,
      "reward_std": 0.778608538210392,
      "rewards/cosine_scaled_reward": 0.23740678373724222,
      "rewards/format_reward": 0.9583333432674408,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2067.5834045410156,
      "epoch": 0.21542857142857144,
      "grad_norm": 0.19451673328876495,
      "kl": 0.312744140625,
      "learning_rate": 2.5834604452159112e-05,
      "loss": 0.1316,
      "reward": 0.983905091881752,
      "reward_std": 0.9569895938038826,
      "rewards/cosine_scaled_reward": 0.09611918777227402,
      "rewards/format_reward": 0.7916666716337204,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2343.1250915527344,
      "epoch": 0.216,
      "grad_norm": 0.4488096535205841,
      "kl": 0.331787109375,
      "learning_rate": 2.5596072820445254e-05,
      "loss": 0.0512,
      "reward": 0.4893091805279255,
      "reward_std": 0.574474148452282,
      "rewards/cosine_scaled_reward": -0.15117877395823598,
      "rewards/format_reward": 0.7916666716337204,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2728.0000915527344,
      "epoch": 0.21657142857142858,
      "grad_norm": 0.21385972201824188,
      "kl": 0.462890625,
      "learning_rate": 2.5358974294659375e-05,
      "loss": 0.0806,
      "reward": 0.12568058911710978,
      "reward_std": 0.8110382407903671,
      "rewards/cosine_scaled_reward": -0.2079930566251278,
      "rewards/format_reward": 0.5416666753590107,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2148.8750915527344,
      "epoch": 0.21714285714285714,
      "grad_norm": 0.21321555972099304,
      "kl": 0.2987060546875,
      "learning_rate": 2.5123320430649133e-05,
      "loss": 0.0423,
      "reward": 1.0129856215789914,
      "reward_std": 0.6403012797236443,
      "rewards/cosine_scaled_reward": 0.08982611820101738,
      "rewards/format_reward": 0.8333333358168602,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1976.8750915527344,
      "epoch": 0.21771428571428572,
      "grad_norm": 0.2554902136325836,
      "kl": 0.26251220703125,
      "learning_rate": 2.4889122713851394e-05,
      "loss": 0.058,
      "reward": 0.7505160495638847,
      "reward_std": 0.803048387169838,
      "rewards/cosine_scaled_reward": 0.0002580210566520691,
      "rewards/format_reward": 0.7500000111758709,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2567.9584197998047,
      "epoch": 0.21828571428571428,
      "grad_norm": 0.23561526834964752,
      "kl": 0.4429931640625,
      "learning_rate": 2.4656392558732464e-05,
      "loss": 0.1233,
      "reward": 0.5255677588284016,
      "reward_std": 0.7546349912881851,
      "rewards/cosine_scaled_reward": -0.02888280153274536,
      "rewards/format_reward": 0.5833333432674408,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1854.666748046875,
      "epoch": 0.21885714285714286,
      "grad_norm": 0.41661813855171204,
      "kl": 0.24273681640625,
      "learning_rate": 2.442514130823177e-05,
      "loss": 0.1697,
      "reward": 0.5279544293880463,
      "reward_std": 0.7327054888010025,
      "rewards/cosine_scaled_reward": -0.1526894560083747,
      "rewards/format_reward": 0.8333333432674408,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2985.416748046875,
      "epoch": 0.21942857142857142,
      "grad_norm": 0.47805020213127136,
      "kl": 0.41455078125,
      "learning_rate": 2.4195380233209008e-05,
      "loss": 0.0078,
      "reward": 0.6774251461029053,
      "reward_std": 0.5871247202157974,
      "rewards/cosine_scaled_reward": -0.07795410230755806,
      "rewards/format_reward": 0.8333333730697632,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1744.916732788086,
      "epoch": 0.22,
      "grad_norm": 0.22037595510482788,
      "kl": 0.257568359375,
      "learning_rate": 2.396712053189486e-05,
      "loss": 0.1639,
      "reward": 0.9186278469860554,
      "reward_std": 0.33239728957414627,
      "rewards/cosine_scaled_reward": 0.021813903003931046,
      "rewards/format_reward": 0.8750000149011612,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2539.1666870117188,
      "epoch": 0.22057142857142858,
      "grad_norm": 0.24289961159229279,
      "kl": 0.399169921875,
      "learning_rate": 2.374037332934512e-05,
      "loss": 0.0472,
      "reward": 0.2605516407638788,
      "reward_std": 0.49868740141391754,
      "rewards/cosine_scaled_reward": -0.20305750891566277,
      "rewards/format_reward": 0.666666679084301,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1674.8750228881836,
      "epoch": 0.22114285714285714,
      "grad_norm": 0.2652975022792816,
      "kl": 0.3177490234375,
      "learning_rate": 2.3515149676898555e-05,
      "loss": 0.0461,
      "reward": 1.29573517665267,
      "reward_std": 0.6526281535625458,
      "rewards/cosine_scaled_reward": 0.2520342655479908,
      "rewards/format_reward": 0.791666679084301,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1416.9166870117188,
      "epoch": 0.22171428571428572,
      "grad_norm": 0.22367416322231293,
      "kl": 0.12982177734375,
      "learning_rate": 2.329146055163824e-05,
      "loss": 0.0651,
      "reward": 1.4672380983829498,
      "reward_std": 0.9607173055410385,
      "rewards/cosine_scaled_reward": 0.27528570708818734,
      "rewards/format_reward": 0.9166666716337204,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1106.2500457763672,
      "epoch": 0.22228571428571428,
      "grad_norm": 0.16189609467983246,
      "kl": 0.0909576416015625,
      "learning_rate": 2.306931685585657e-05,
      "loss": 0.0612,
      "reward": 1.3787225484848022,
      "reward_std": 0.8911682516336441,
      "rewards/cosine_scaled_reward": 0.23102791607379913,
      "rewards/format_reward": 0.9166666716337204,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2349.7500915527344,
      "epoch": 0.22285714285714286,
      "grad_norm": 0.24648283421993256,
      "kl": 0.2828369140625,
      "learning_rate": 2.284872941652386e-05,
      "loss": 0.0402,
      "reward": 0.3927423320710659,
      "reward_std": 0.49121467769145966,
      "rewards/cosine_scaled_reward": -0.26196216978132725,
      "rewards/format_reward": 0.9166666716337204,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2174.7083740234375,
      "epoch": 0.22342857142857142,
      "grad_norm": 0.2675975263118744,
      "kl": 0.111419677734375,
      "learning_rate": 2.2629708984760708e-05,
      "loss": 0.0141,
      "reward": 0.9644523113965988,
      "reward_std": 0.7060663215816021,
      "rewards/cosine_scaled_reward": -0.017773881554603577,
      "rewards/format_reward": 1.0,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2664.3334045410156,
      "epoch": 0.224,
      "grad_norm": 0.2830732762813568,
      "kl": 0.30059814453125,
      "learning_rate": 2.2412266235313975e-05,
      "loss": -0.0216,
      "reward": 0.530030932277441,
      "reward_std": 0.6014800369739532,
      "rewards/cosine_scaled_reward": -0.17248454061336815,
      "rewards/format_reward": 0.875,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2592.7084197998047,
      "epoch": 0.22457142857142856,
      "grad_norm": 0.18396273255348206,
      "kl": 0.33929443359375,
      "learning_rate": 2.219641176603649e-05,
      "loss": 0.083,
      "reward": 0.20931893214583397,
      "reward_std": 0.35438287258148193,
      "rewards/cosine_scaled_reward": -0.29117387905716896,
      "rewards/format_reward": 0.7916666716337204,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1201.6667175292969,
      "epoch": 0.22514285714285714,
      "grad_norm": 0.24478864669799805,
      "kl": 0.10565185546875,
      "learning_rate": 2.198215609737056e-05,
      "loss": 0.026,
      "reward": 0.9410552708432078,
      "reward_std": 0.8450669944286346,
      "rewards/cosine_scaled_reward": -0.008639028761535883,
      "rewards/format_reward": 0.9583333432674408,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2590.3750610351562,
      "epoch": 0.2257142857142857,
      "grad_norm": 0.24432548880577087,
      "kl": 0.191162109375,
      "learning_rate": 2.1769509671835224e-05,
      "loss": -0.0402,
      "reward": 1.1612588688731194,
      "reward_std": 0.8244020491838455,
      "rewards/cosine_scaled_reward": 0.08062941022217274,
      "rewards/format_reward": 1.0,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2692.1250610351562,
      "epoch": 0.22628571428571428,
      "grad_norm": 0.2695596218109131,
      "kl": 0.24951171875,
      "learning_rate": 2.1558482853517257e-05,
      "loss": 0.0775,
      "reward": 0.7021965757012367,
      "reward_std": 0.6225183010101318,
      "rewards/cosine_scaled_reward": -0.06556838750839233,
      "rewards/format_reward": 0.8333333432674408,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2493.2084197998047,
      "epoch": 0.22685714285714287,
      "grad_norm": 0.8182937502861023,
      "kl": 0.2171630859375,
      "learning_rate": 2.1349085927566073e-05,
      "loss": 0.0479,
      "reward": 1.2701049419119954,
      "reward_std": 0.675561960786581,
      "rewards/cosine_scaled_reward": 0.28088577929884195,
      "rewards/format_reward": 0.7083333358168602,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1911.6250305175781,
      "epoch": 0.22742857142857142,
      "grad_norm": 0.2450830191373825,
      "kl": 0.1436767578125,
      "learning_rate": 2.114132909969241e-05,
      "loss": -0.0067,
      "reward": 0.8501160591840744,
      "reward_std": 0.6082466319203377,
      "rewards/cosine_scaled_reward": -0.03327532671391964,
      "rewards/format_reward": 0.9166666865348816,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2610.50008392334,
      "epoch": 0.228,
      "grad_norm": 0.3445512056350708,
      "kl": 0.45556640625,
      "learning_rate": 2.093522249567097e-05,
      "loss": 0.0639,
      "reward": 0.44864194467663765,
      "reward_std": 0.6613021939992905,
      "rewards/cosine_scaled_reward": -0.15067904442548752,
      "rewards/format_reward": 0.75,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 957.1666793823242,
      "epoch": 0.22857142857142856,
      "grad_norm": 0.19021636247634888,
      "kl": 0.042205810546875,
      "learning_rate": 2.0730776160846853e-05,
      "loss": 0.0533,
      "reward": 1.58597931265831,
      "reward_std": 0.6685393303632736,
      "rewards/cosine_scaled_reward": 0.2929896265268326,
      "rewards/format_reward": 1.0,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1739.2500457763672,
      "epoch": 0.22914285714285715,
      "grad_norm": 0.2134786993265152,
      "kl": 0.13818359375,
      "learning_rate": 2.0528000059645997e-05,
      "loss": 0.0305,
      "reward": 0.6482241563498974,
      "reward_std": 0.4021666403859854,
      "rewards/cosine_scaled_reward": -0.15505461394786835,
      "rewards/format_reward": 0.9583333432674408,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2346.7084197998047,
      "epoch": 0.2297142857142857,
      "grad_norm": 0.2189859002828598,
      "kl": 0.27001953125,
      "learning_rate": 2.0326904075089492e-05,
      "loss": 0.0286,
      "reward": 0.9544563218951225,
      "reward_std": 0.9867835119366646,
      "rewards/cosine_scaled_reward": 0.03972811624407768,
      "rewards/format_reward": 0.8750000149011612,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1908.1250305175781,
      "epoch": 0.2302857142857143,
      "grad_norm": 0.2624204158782959,
      "kl": 0.134124755859375,
      "learning_rate": 2.0127498008311922e-05,
      "loss": 0.0638,
      "reward": 1.8549927771091461,
      "reward_std": 0.986580029129982,
      "rewards/cosine_scaled_reward": 0.44832973554730415,
      "rewards/format_reward": 0.9583333432674408,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2161.166732788086,
      "epoch": 0.23085714285714284,
      "grad_norm": 0.25534167885780334,
      "kl": 0.202880859375,
      "learning_rate": 1.9929791578083658e-05,
      "loss": 0.0734,
      "reward": 0.6427417621016502,
      "reward_std": 0.7742570489645004,
      "rewards/cosine_scaled_reward": -0.11612912639975548,
      "rewards/format_reward": 0.875,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2189.333396911621,
      "epoch": 0.23142857142857143,
      "grad_norm": 0.2308729588985443,
      "kl": 0.1011962890625,
      "learning_rate": 1.9733794420337214e-05,
      "loss": 0.0567,
      "reward": 1.7219876870512962,
      "reward_std": 0.4820314012467861,
      "rewards/cosine_scaled_reward": 0.40266046952456236,
      "rewards/format_reward": 0.9166666865348816,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2541.2916870117188,
      "epoch": 0.232,
      "grad_norm": 0.33757075667381287,
      "kl": 0.1832275390625,
      "learning_rate": 1.9539516087697518e-05,
      "loss": 0.0846,
      "reward": 0.8487504161894321,
      "reward_std": 0.7889838367700577,
      "rewards/cosine_scaled_reward": 0.007708512246608734,
      "rewards/format_reward": 0.833333358168602,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1872.5000457763672,
      "epoch": 0.23257142857142857,
      "grad_norm": 0.1827789694070816,
      "kl": 0.206634521484375,
      "learning_rate": 1.9346966049016424e-05,
      "loss": 0.1478,
      "reward": 1.3279360756278038,
      "reward_std": 0.7395913898944855,
      "rewards/cosine_scaled_reward": 0.2056347131729126,
      "rewards/format_reward": 0.9166666716337204,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1630.0834045410156,
      "epoch": 0.23314285714285715,
      "grad_norm": 0.2044801563024521,
      "kl": 0.2037353515625,
      "learning_rate": 1.915615368891117e-05,
      "loss": 0.064,
      "reward": 1.0793461948633194,
      "reward_std": 0.6011250354349613,
      "rewards/cosine_scaled_reward": 0.0813397541642189,
      "rewards/format_reward": 0.9166666716337204,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2164.000030517578,
      "epoch": 0.2337142857142857,
      "grad_norm": 0.1898968368768692,
      "kl": 0.2315673828125,
      "learning_rate": 1.8967088307307003e-05,
      "loss": 0.0842,
      "reward": 0.5218268632888794,
      "reward_std": 0.8983562588691711,
      "rewards/cosine_scaled_reward": -0.13491991348564625,
      "rewards/format_reward": 0.7916666716337204,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2933.916748046875,
      "epoch": 0.2342857142857143,
      "grad_norm": 0.18983033299446106,
      "kl": 0.38818359375,
      "learning_rate": 1.877977911898387e-05,
      "loss": 0.0487,
      "reward": 0.6665925020352006,
      "reward_std": 0.4128606617450714,
      "rewards/cosine_scaled_reward": 0.020796246826648712,
      "rewards/format_reward": 0.625,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2553.1250610351562,
      "epoch": 0.23485714285714285,
      "grad_norm": 0.31600725650787354,
      "kl": 0.187255859375,
      "learning_rate": 1.8594235253127375e-05,
      "loss": 0.0502,
      "reward": 0.7985327839851379,
      "reward_std": 0.7022458389401436,
      "rewards/cosine_scaled_reward": -0.07990030199289322,
      "rewards/format_reward": 0.9583333432674408,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2516.3334350585938,
      "epoch": 0.23542857142857143,
      "grad_norm": 0.48707082867622375,
      "kl": 0.3245849609375,
      "learning_rate": 1.8410465752883758e-05,
      "loss": -0.0511,
      "reward": 0.08879928779788315,
      "reward_std": 0.36526790633797646,
      "rewards/cosine_scaled_reward": -0.3306003734469414,
      "rewards/format_reward": 0.7500000074505806,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2167.3751068115234,
      "epoch": 0.236,
      "grad_norm": 0.20420606434345245,
      "kl": 0.11083984375,
      "learning_rate": 1.822847957491922e-05,
      "loss": 0.028,
      "reward": 1.2763259708881378,
      "reward_std": 0.8057107403874397,
      "rewards/cosine_scaled_reward": 0.13816297985613346,
      "rewards/format_reward": 1.0,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2270.7500915527344,
      "epoch": 0.23657142857142857,
      "grad_norm": 0.30266261100769043,
      "kl": 0.17315673828125,
      "learning_rate": 1.804828558898332e-05,
      "loss": -0.0628,
      "reward": 0.9188649505376816,
      "reward_std": 0.6045625507831573,
      "rewards/cosine_scaled_reward": 0.042765818536281586,
      "rewards/format_reward": 0.8333333432674408,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2395.0000762939453,
      "epoch": 0.23714285714285716,
      "grad_norm": 0.19137543439865112,
      "kl": 0.248870849609375,
      "learning_rate": 1.7869892577476724e-05,
      "loss": 0.0178,
      "reward": 0.5630744565278292,
      "reward_std": 0.5253113936632872,
      "rewards/cosine_scaled_reward": -0.13512944988906384,
      "rewards/format_reward": 0.8333333432674408,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2648.6250610351562,
      "epoch": 0.2377142857142857,
      "grad_norm": 0.3866080641746521,
      "kl": 0.272674560546875,
      "learning_rate": 1.769330923502313e-05,
      "loss": 0.0603,
      "reward": 0.532728798687458,
      "reward_std": 0.5860278196632862,
      "rewards/cosine_scaled_reward": -0.17113562487065792,
      "rewards/format_reward": 0.8750000149011612,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1926.833396911621,
      "epoch": 0.2382857142857143,
      "grad_norm": 0.28291311860084534,
      "kl": 0.1495361328125,
      "learning_rate": 1.7518544168045525e-05,
      "loss": 0.2172,
      "reward": 0.8828543275594711,
      "reward_std": 0.4056566394865513,
      "rewards/cosine_scaled_reward": -0.016906175762414932,
      "rewards/format_reward": 0.9166666865348816,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2516.6250610351562,
      "epoch": 0.23885714285714285,
      "grad_norm": 0.26306572556495667,
      "kl": 0.2476806640625,
      "learning_rate": 1.734560589434673e-05,
      "loss": 0.0767,
      "reward": 0.2381377201527357,
      "reward_std": 0.3217194452881813,
      "rewards/cosine_scaled_reward": -0.33926449716091156,
      "rewards/format_reward": 0.9166666865348816,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2267.3750915527344,
      "epoch": 0.23942857142857144,
      "grad_norm": 0.16968634724617004,
      "kl": 0.29937744140625,
      "learning_rate": 1.7174502842694213e-05,
      "loss": -0.0294,
      "reward": 0.15903769060969353,
      "reward_std": 0.6953174918889999,
      "rewards/cosine_scaled_reward": -0.2954811677336693,
      "rewards/format_reward": 0.7500000074505806,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1356.208366394043,
      "epoch": 0.24,
      "grad_norm": 0.16105955839157104,
      "kl": 0.1409912109375,
      "learning_rate": 1.7005243352409334e-05,
      "loss": 0.0785,
      "reward": 1.5586809143424034,
      "reward_std": 0.6286041433922946,
      "rewards/cosine_scaled_reward": 0.3210071250796318,
      "rewards/format_reward": 0.9166666716337204,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2299.0833740234375,
      "epoch": 0.24057142857142857,
      "grad_norm": 0.24983569979667664,
      "kl": 0.207275390625,
      "learning_rate": 1.6837835672960835e-05,
      "loss": 0.0504,
      "reward": 0.802627682685852,
      "reward_std": 0.7565935924649239,
      "rewards/cosine_scaled_reward": -0.057019513100385666,
      "rewards/format_reward": 0.9166666865348816,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3433.166748046875,
      "epoch": 0.24114285714285713,
      "grad_norm": 0.26617732644081116,
      "kl": 0.4814453125,
      "learning_rate": 1.6672287963562855e-05,
      "loss": 0.029,
      "reward": 0.4502560719847679,
      "reward_std": 0.8340122252702713,
      "rewards/cosine_scaled_reward": -0.14987197145819664,
      "rewards/format_reward": 0.7500000149011612,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1953.916732788086,
      "epoch": 0.24171428571428571,
      "grad_norm": 0.23757286369800568,
      "kl": 0.172607421875,
      "learning_rate": 1.6508608292777204e-05,
      "loss": -0.0152,
      "reward": 0.899025060236454,
      "reward_std": 0.9901894629001617,
      "rewards/cosine_scaled_reward": -0.008820809423923492,
      "rewards/format_reward": 0.9166666865348816,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2103.3334045410156,
      "epoch": 0.2422857142857143,
      "grad_norm": 0.7382405400276184,
      "kl": 0.1934814453125,
      "learning_rate": 1.63468046381201e-05,
      "loss": 0.0604,
      "reward": 0.9285087138414383,
      "reward_std": 0.8037227541208267,
      "rewards/cosine_scaled_reward": 0.026754358783364296,
      "rewards/format_reward": 0.8750000298023224,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3309.3334350585938,
      "epoch": 0.24285714285714285,
      "grad_norm": 0.3841143250465393,
      "kl": 0.486083984375,
      "learning_rate": 1.6186884885673413e-05,
      "loss": 0.0301,
      "reward": 0.1549401180818677,
      "reward_std": 0.38981814309954643,
      "rewards/cosine_scaled_reward": -0.2766966111958027,
      "rewards/format_reward": 0.7083333507180214,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2825.0000610351562,
      "epoch": 0.24342857142857144,
      "grad_norm": 0.22417770326137543,
      "kl": 0.32391357421875,
      "learning_rate": 1.602885682970026e-05,
      "loss": 0.0358,
      "reward": 0.6782936668023467,
      "reward_std": 0.5605045408010483,
      "rewards/cosine_scaled_reward": -0.03585319593548775,
      "rewards/format_reward": 0.7500000149011612,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3096.291748046875,
      "epoch": 0.244,
      "grad_norm": 0.21399366855621338,
      "kl": 0.3536376953125,
      "learning_rate": 1.5872728172265147e-05,
      "loss": 0.0608,
      "reward": 0.7031284123659134,
      "reward_std": 0.7477234750986099,
      "rewards/cosine_scaled_reward": -0.04426916316151619,
      "rewards/format_reward": 0.7916666865348816,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1948.2916870117188,
      "epoch": 0.24457142857142858,
      "grad_norm": 0.27849727869033813,
      "kl": 0.208099365234375,
      "learning_rate": 1.5718506522858573e-05,
      "loss": 0.0685,
      "reward": 1.0386689975857735,
      "reward_std": 0.9889701455831528,
      "rewards/cosine_scaled_reward": 0.1026678173802793,
      "rewards/format_reward": 0.8333333432674408,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2656.875030517578,
      "epoch": 0.24514285714285713,
      "grad_norm": 0.2026260942220688,
      "kl": 0.36163330078125,
      "learning_rate": 1.556619939802615e-05,
      "loss": 0.0058,
      "reward": 0.3719893768429756,
      "reward_std": 0.768018901348114,
      "rewards/cosine_scaled_reward": -0.1890053153038025,
      "rewards/format_reward": 0.7500000074505806,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3101.416748046875,
      "epoch": 0.24571428571428572,
      "grad_norm": 0.3154773414134979,
      "kl": 0.255615234375,
      "learning_rate": 1.5415814221002267e-05,
      "loss": 0.0772,
      "reward": 0.6607606410980225,
      "reward_std": 0.9749536961317062,
      "rewards/cosine_scaled_reward": -0.10711969062685966,
      "rewards/format_reward": 0.8750000298023224,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3123.3750610351562,
      "epoch": 0.24628571428571427,
      "grad_norm": 0.22464747726917267,
      "kl": 0.48779296875,
      "learning_rate": 1.526735832134829e-05,
      "loss": 0.088,
      "reward": 0.6580713596194983,
      "reward_std": 0.521630696952343,
      "rewards/cosine_scaled_reward": -0.06679766997694969,
      "rewards/format_reward": 0.7916666716337204,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2067.7083892822266,
      "epoch": 0.24685714285714286,
      "grad_norm": 0.22717218101024628,
      "kl": 0.16131591796875,
      "learning_rate": 1.5120838934595339e-05,
      "loss": -0.0427,
      "reward": 0.5478418692946434,
      "reward_std": 0.8848011568188667,
      "rewards/cosine_scaled_reward": -0.1427457444369793,
      "rewards/format_reward": 0.833333358168602,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1642.3334045410156,
      "epoch": 0.24742857142857144,
      "grad_norm": 0.24554996192455292,
      "kl": 0.128936767578125,
      "learning_rate": 1.4976263201891614e-05,
      "loss": -0.0574,
      "reward": 1.2519729286432266,
      "reward_std": 0.6352694146335125,
      "rewards/cosine_scaled_reward": 0.1468198113143444,
      "rewards/format_reward": 0.9583333432674408,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3017.0834350585938,
      "epoch": 0.248,
      "grad_norm": 0.23111708462238312,
      "kl": 0.3212890625,
      "learning_rate": 1.4833638169654352e-05,
      "loss": 0.0242,
      "reward": 0.4378589540719986,
      "reward_std": 0.47337810695171356,
      "rewards/cosine_scaled_reward": -0.1977371945977211,
      "rewards/format_reward": 0.8333333432674408,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2320.666732788086,
      "epoch": 0.24857142857142858,
      "grad_norm": 0.18372684717178345,
      "kl": 0.31683349609375,
      "learning_rate": 1.469297078922642e-05,
      "loss": 0.122,
      "reward": 0.9090068517252803,
      "reward_std": 0.4904613792896271,
      "rewards/cosine_scaled_reward": 0.10033675655722618,
      "rewards/format_reward": 0.7083333507180214,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2480.87508392334,
      "epoch": 0.24914285714285714,
      "grad_norm": 0.36884191632270813,
      "kl": 0.284912109375,
      "learning_rate": 1.4554267916537495e-05,
      "loss": 0.1089,
      "reward": 0.9375562369823456,
      "reward_std": 1.2034636735916138,
      "rewards/cosine_scaled_reward": 0.07294480130076408,
      "rewards/format_reward": 0.7916666865348816,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1246.0000381469727,
      "epoch": 0.24971428571428572,
      "grad_norm": 0.5064862966537476,
      "kl": 0.128814697265625,
      "learning_rate": 1.4417536311769886e-05,
      "loss": 0.0032,
      "reward": 1.3162876069545746,
      "reward_std": 0.5200040265917778,
      "rewards/cosine_scaled_reward": 0.199810478836298,
      "rewards/format_reward": 0.9166666716337204,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2604.0834045410156,
      "epoch": 0.2502857142857143,
      "grad_norm": 0.15203051269054413,
      "kl": 0.22100830078125,
      "learning_rate": 1.428278263902913e-05,
      "loss": 0.0638,
      "reward": 0.6659852899610996,
      "reward_std": 0.7454147860407829,
      "rewards/cosine_scaled_reward": -0.10450738109648228,
      "rewards/format_reward": 0.8750000149011612,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1844.3750915527344,
      "epoch": 0.25085714285714283,
      "grad_norm": 0.16812385618686676,
      "kl": 0.21405029296875,
      "learning_rate": 1.4150013466019115e-05,
      "loss": 0.0654,
      "reward": 0.451425364241004,
      "reward_std": 0.5447552353143692,
      "rewards/cosine_scaled_reward": -0.21178732067346573,
      "rewards/format_reward": 0.8750000149011612,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1738.7500305175781,
      "epoch": 0.25142857142857145,
      "grad_norm": 0.25210797786712646,
      "kl": 0.2158203125,
      "learning_rate": 1.4019235263722036e-05,
      "loss": -0.0197,
      "reward": 0.3830821365118027,
      "reward_std": 0.4169432930648327,
      "rewards/cosine_scaled_reward": -0.24595895409584045,
      "rewards/format_reward": 0.8750000149011612,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2879.2501220703125,
      "epoch": 0.252,
      "grad_norm": 0.4387795925140381,
      "kl": 0.404296875,
      "learning_rate": 1.389045440608296e-05,
      "loss": 0.0095,
      "reward": 0.17910403199493885,
      "reward_std": 0.450714360922575,
      "rewards/cosine_scaled_reward": -0.28544800728559494,
      "rewards/format_reward": 0.7500000149011612,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2282.2501068115234,
      "epoch": 0.25257142857142856,
      "grad_norm": 0.2824661433696747,
      "kl": 0.2359619140625,
      "learning_rate": 1.3763677169699218e-05,
      "loss": 0.016,
      "reward": 1.4356295093894005,
      "reward_std": 0.3898382596671581,
      "rewards/cosine_scaled_reward": 0.28031472861766815,
      "rewards/format_reward": 0.875,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2792.0416870117188,
      "epoch": 0.25314285714285717,
      "grad_norm": 0.2786317765712738,
      "kl": 0.376953125,
      "learning_rate": 1.3638909733514454e-05,
      "loss": -0.0293,
      "reward": 0.19937769044190645,
      "reward_std": 0.5115105733275414,
      "rewards/cosine_scaled_reward": -0.2336445041000843,
      "rewards/format_reward": 0.6666666828095913,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2881.7500610351562,
      "epoch": 0.2537142857142857,
      "grad_norm": 0.19312676787376404,
      "kl": 0.377685546875,
      "learning_rate": 1.3516158178517482e-05,
      "loss": 0.0486,
      "reward": 0.3308720774948597,
      "reward_std": 0.581496886909008,
      "rewards/cosine_scaled_reward": -0.20956396497786045,
      "rewards/format_reward": 0.7500000149011612,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2190.1251068115234,
      "epoch": 0.2542857142857143,
      "grad_norm": 0.3452683091163635,
      "kl": 0.161651611328125,
      "learning_rate": 1.3395428487445916e-05,
      "loss": -0.0681,
      "reward": 1.0236865878105164,
      "reward_std": 0.8273026198148727,
      "rewards/cosine_scaled_reward": 0.07434329763054848,
      "rewards/format_reward": 0.875,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1811.2500305175781,
      "epoch": 0.25485714285714284,
      "grad_norm": 0.2189359962940216,
      "kl": 0.183319091796875,
      "learning_rate": 1.3276726544494572e-05,
      "loss": 0.0868,
      "reward": 0.420175077393651,
      "reward_std": 0.2947616521269083,
      "rewards/cosine_scaled_reward": -0.24824582412838936,
      "rewards/format_reward": 0.9166666865348816,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2670.6250610351562,
      "epoch": 0.25542857142857145,
      "grad_norm": 0.21349620819091797,
      "kl": 0.27740478515625,
      "learning_rate": 1.3160058135028691e-05,
      "loss": 0.0593,
      "reward": 0.64484803378582,
      "reward_std": 0.8606602028012276,
      "rewards/cosine_scaled_reward": -0.11507599544711411,
      "rewards/format_reward": 0.8750000149011612,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2424.8334350585938,
      "epoch": 0.256,
      "grad_norm": 0.23509125411510468,
      "kl": 0.2618408203125,
      "learning_rate": 1.3045428945301954e-05,
      "loss": 0.0366,
      "reward": 1.0526692494750023,
      "reward_std": 0.46373073756694794,
      "rewards/cosine_scaled_reward": 0.13050128147006035,
      "rewards/format_reward": 0.7916666865348816,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2667.2084350585938,
      "epoch": 0.25657142857142856,
      "grad_norm": 0.39572006464004517,
      "kl": 0.258056640625,
      "learning_rate": 1.2932844562179353e-05,
      "loss": 0.0822,
      "reward": 1.0360109135508537,
      "reward_std": 0.7895313426852226,
      "rewards/cosine_scaled_reward": 0.18467211723327637,
      "rewards/format_reward": 0.6666666828095913,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1222.3750762939453,
      "epoch": 0.2571428571428571,
      "grad_norm": 0.18777267634868622,
      "kl": 0.132415771484375,
      "learning_rate": 1.2822310472864884e-05,
      "loss": 0.1176,
      "reward": 1.2537522641941905,
      "reward_std": 0.40848940052092075,
      "rewards/cosine_scaled_reward": 0.1893761195242405,
      "rewards/format_reward": 0.875,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2296.0000915527344,
      "epoch": 0.25771428571428573,
      "grad_norm": 0.3156144618988037,
      "kl": 0.206787109375,
      "learning_rate": 1.2713832064634126e-05,
      "loss": 0.1219,
      "reward": 0.5749104283750057,
      "reward_std": 0.6167091354727745,
      "rewards/cosine_scaled_reward": -0.21254480443894863,
      "rewards/format_reward": 1.0,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1653.2500686645508,
      "epoch": 0.2582857142857143,
      "grad_norm": 0.4651844799518585,
      "kl": 0.131317138671875,
      "learning_rate": 1.260741462457165e-05,
      "loss": 0.133,
      "reward": 1.0920193493366241,
      "reward_std": 0.3826703876256943,
      "rewards/cosine_scaled_reward": 0.04600968584418297,
      "rewards/format_reward": 1.0,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2377.666717529297,
      "epoch": 0.25885714285714284,
      "grad_norm": 0.2226954698562622,
      "kl": 0.1348876953125,
      "learning_rate": 1.2503063339313356e-05,
      "loss": 0.0658,
      "reward": 0.6897746287286282,
      "reward_std": 0.6120940484106541,
      "rewards/cosine_scaled_reward": -0.07177936565130949,
      "rewards/format_reward": 0.8333333358168602,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2628.2500610351562,
      "epoch": 0.25942857142857145,
      "grad_norm": 0.19401276111602783,
      "kl": 0.25439453125,
      "learning_rate": 1.240078329479367e-05,
      "loss": 0.0921,
      "reward": 1.017133679240942,
      "reward_std": 0.8067431151866913,
      "rewards/cosine_scaled_reward": 0.1335668321698904,
      "rewards/format_reward": 0.7500000149011612,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2419.7083740234375,
      "epoch": 0.26,
      "grad_norm": 0.4960789680480957,
      "kl": 0.21307373046875,
      "learning_rate": 1.2300579475997657e-05,
      "loss": 0.1634,
      "reward": 0.6709480434656143,
      "reward_std": 0.7460784912109375,
      "rewards/cosine_scaled_reward": -0.08119264617562294,
      "rewards/format_reward": 0.833333358168602,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3323.3751220703125,
      "epoch": 0.26057142857142856,
      "grad_norm": 0.33247068524360657,
      "kl": 0.4267578125,
      "learning_rate": 1.2202456766718093e-05,
      "loss": 0.0369,
      "reward": 0.37625838071107864,
      "reward_std": 0.7378395646810532,
      "rewards/cosine_scaled_reward": -0.16603748872876167,
      "rewards/format_reward": 0.7083333507180214,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2653.0833740234375,
      "epoch": 0.2611428571428571,
      "grad_norm": 0.27788954973220825,
      "kl": 0.3778076171875,
      "learning_rate": 1.210641994931739e-05,
      "loss": 0.0299,
      "reward": 0.7535701543092728,
      "reward_std": 1.0977338403463364,
      "rewards/cosine_scaled_reward": 0.0017850752919912338,
      "rewards/format_reward": 0.7500000149011612,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2128.0000610351562,
      "epoch": 0.26171428571428573,
      "grad_norm": 0.23413938283920288,
      "kl": 0.14739990234375,
      "learning_rate": 1.2012473704494538e-05,
      "loss": 0.0325,
      "reward": 0.5362795293331146,
      "reward_std": 0.3089091796427965,
      "rewards/cosine_scaled_reward": -0.21102692931890488,
      "rewards/format_reward": 0.9583333432674408,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2767.3750610351562,
      "epoch": 0.2622857142857143,
      "grad_norm": 0.22002078592777252,
      "kl": 0.2928466796875,
      "learning_rate": 1.1920622611056975e-05,
      "loss": 0.0267,
      "reward": 1.019486054778099,
      "reward_std": 0.9020187258720398,
      "rewards/cosine_scaled_reward": 0.05140970088541508,
      "rewards/format_reward": 0.9166666716337204,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2332.416778564453,
      "epoch": 0.26285714285714284,
      "grad_norm": 0.33746153116226196,
      "kl": 0.23980712890625,
      "learning_rate": 1.1830871145697413e-05,
      "loss": 0.1162,
      "reward": 1.126957267522812,
      "reward_std": 0.5669834688305855,
      "rewards/cosine_scaled_reward": 0.08431193931028247,
      "rewards/format_reward": 0.9583333432674408,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2162.8333740234375,
      "epoch": 0.2634285714285714,
      "grad_norm": 0.5436040759086609,
      "kl": 0.21539306640625,
      "learning_rate": 1.174322368277565e-05,
      "loss": 0.1728,
      "reward": 0.7292355694808066,
      "reward_std": 0.5752151645720005,
      "rewards/cosine_scaled_reward": -0.09371556341648102,
      "rewards/format_reward": 0.9166666716337204,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3241.666748046875,
      "epoch": 0.264,
      "grad_norm": 0.27894335985183716,
      "kl": 0.4532470703125,
      "learning_rate": 1.1657684494105387e-05,
      "loss": 0.0877,
      "reward": 0.5649700947105885,
      "reward_std": 0.8659732490777969,
      "rewards/cosine_scaled_reward": -0.05084826331585646,
      "rewards/format_reward": 0.6666666865348816,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3078.8750610351562,
      "epoch": 0.26457142857142857,
      "grad_norm": 0.2687225043773651,
      "kl": 0.2301025390625,
      "learning_rate": 1.1574257748745986e-05,
      "loss": 0.0225,
      "reward": 1.2971481904387474,
      "reward_std": 1.1507280617952347,
      "rewards/cosine_scaled_reward": 0.2110740765929222,
      "rewards/format_reward": 0.8750000298023224,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2286.7500915527344,
      "epoch": 0.2651428571428571,
      "grad_norm": 0.17512497305870056,
      "kl": 0.220703125,
      "learning_rate": 1.149294751279933e-05,
      "loss": -0.0102,
      "reward": 0.6485597789287567,
      "reward_std": 0.6553373290225863,
      "rewards/cosine_scaled_reward": -0.11322011891752481,
      "rewards/format_reward": 0.8750000149011612,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2979.9584350585938,
      "epoch": 0.26571428571428574,
      "grad_norm": 0.2867068648338318,
      "kl": 0.391845703125,
      "learning_rate": 1.1413757749211602e-05,
      "loss": 0.0628,
      "reward": 0.7631555460393429,
      "reward_std": 0.9646207839250565,
      "rewards/cosine_scaled_reward": 0.027411112561821938,
      "rewards/format_reward": 0.7083333432674408,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2141.0834350585938,
      "epoch": 0.2662857142857143,
      "grad_norm": 0.3626898527145386,
      "kl": 0.17266845703125,
      "learning_rate": 1.133669231758016e-05,
      "loss": 0.0811,
      "reward": 1.0105885118246078,
      "reward_std": 0.7124912440776825,
      "rewards/cosine_scaled_reward": 0.026127580553293228,
      "rewards/format_reward": 0.9583333432674408,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2783.666748046875,
      "epoch": 0.26685714285714285,
      "grad_norm": 0.4113839566707611,
      "kl": 0.309814453125,
      "learning_rate": 1.1261754973965422e-05,
      "loss": -0.0117,
      "reward": 0.3410836011171341,
      "reward_std": 0.5675521939992905,
      "rewards/cosine_scaled_reward": -0.22529152780771255,
      "rewards/format_reward": 0.7916666679084301,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2199.7084045410156,
      "epoch": 0.2674285714285714,
      "grad_norm": 0.2993246614933014,
      "kl": 0.20440673828125,
      "learning_rate": 1.1188949370707787e-05,
      "loss": 0.0531,
      "reward": 0.8006970658898354,
      "reward_std": 0.7448753714561462,
      "rewards/cosine_scaled_reward": -0.057984789134934545,
      "rewards/format_reward": 0.9166666865348816,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2270.166717529297,
      "epoch": 0.268,
      "grad_norm": 0.2324548065662384,
      "kl": 0.23199462890625,
      "learning_rate": 1.1118279056249655e-05,
      "loss": -0.0361,
      "reward": 0.4024841138161719,
      "reward_std": 0.4483271986246109,
      "rewards/cosine_scaled_reward": -0.23625795356929302,
      "rewards/format_reward": 0.875,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1424.6666946411133,
      "epoch": 0.26857142857142857,
      "grad_norm": 0.21465709805488586,
      "kl": 0.157196044921875,
      "learning_rate": 1.1049747474962445e-05,
      "loss": 0.1015,
      "reward": 1.568716924637556,
      "reward_std": 0.24283705279231071,
      "rewards/cosine_scaled_reward": 0.32602518051862717,
      "rewards/format_reward": 0.9166666865348816,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2525.666732788086,
      "epoch": 0.26914285714285713,
      "grad_norm": 0.22672057151794434,
      "kl": 0.36737060546875,
      "learning_rate": 1.0983357966978745e-05,
      "loss": 0.0491,
      "reward": 0.396379379555583,
      "reward_std": 0.629085049033165,
      "rewards/cosine_scaled_reward": -0.21847698092460632,
      "rewards/format_reward": 0.8333333432674408,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2505.666748046875,
      "epoch": 0.26971428571428574,
      "grad_norm": 0.2537076771259308,
      "kl": 0.24371337890625,
      "learning_rate": 1.0919113768029518e-05,
      "loss": -0.0105,
      "reward": 0.5954728499054909,
      "reward_std": 0.3647001665085554,
      "rewards/cosine_scaled_reward": -0.16059689968824387,
      "rewards/format_reward": 0.9166666716337204,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2161.3334350585938,
      "epoch": 0.2702857142857143,
      "grad_norm": 0.14275974035263062,
      "kl": 0.173126220703125,
      "learning_rate": 1.0857018009286382e-05,
      "loss": 0.0868,
      "reward": 0.9977487437427044,
      "reward_std": 0.6313546672463417,
      "rewards/cosine_scaled_reward": 0.06137435883283615,
      "rewards/format_reward": 0.875,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2200.2916870117188,
      "epoch": 0.27085714285714285,
      "grad_norm": 0.2012917548418045,
      "kl": 0.238983154296875,
      "learning_rate": 1.0797073717209014e-05,
      "loss": 0.0855,
      "reward": 0.5041792467236519,
      "reward_std": 0.5631431899964809,
      "rewards/cosine_scaled_reward": -0.16457706969231367,
      "rewards/format_reward": 0.833333358168602,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2275.916732788086,
      "epoch": 0.2714285714285714,
      "grad_norm": 0.39984893798828125,
      "kl": 0.262451171875,
      "learning_rate": 1.0739283813397639e-05,
      "loss": 0.1526,
      "reward": 1.0086217746138573,
      "reward_std": 1.0287954062223434,
      "rewards/cosine_scaled_reward": 0.08764421939849854,
      "rewards/format_reward": 0.833333358168602,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2965.416748046875,
      "epoch": 0.272,
      "grad_norm": 0.24678374826908112,
      "kl": 0.3681640625,
      "learning_rate": 1.0683651114450641e-05,
      "loss": 0.0372,
      "reward": 0.4287208868190646,
      "reward_std": 0.7793765217065811,
      "rewards/cosine_scaled_reward": -0.20230622496455908,
      "rewards/format_reward": 0.833333358168602,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2816.041748046875,
      "epoch": 0.2725714285714286,
      "grad_norm": 0.23802922666072845,
      "kl": 0.2159423828125,
      "learning_rate": 1.0630178331827282e-05,
      "loss": 0.0086,
      "reward": 0.5022896975278854,
      "reward_std": 0.682855635881424,
      "rewards/cosine_scaled_reward": -0.14468851312994957,
      "rewards/format_reward": 0.7916666865348816,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3076.6251220703125,
      "epoch": 0.27314285714285713,
      "grad_norm": 0.48291072249412537,
      "kl": 0.40087890625,
      "learning_rate": 1.0578868071715544e-05,
      "loss": 0.0037,
      "reward": 0.7402574494481087,
      "reward_std": 0.9319310411810875,
      "rewards/cosine_scaled_reward": -0.004871279001235962,
      "rewards/format_reward": 0.7500000149011612,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2252.5417098999023,
      "epoch": 0.2737142857142857,
      "grad_norm": 0.42745697498321533,
      "kl": 0.185760498046875,
      "learning_rate": 1.0529722834905126e-05,
      "loss": 0.059,
      "reward": 1.4727585576474667,
      "reward_std": 0.44060441479086876,
      "rewards/cosine_scaled_reward": 0.3197126239538193,
      "rewards/format_reward": 0.8333333432674408,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3045.291748046875,
      "epoch": 0.2742857142857143,
      "grad_norm": 0.2685858905315399,
      "kl": 0.316650390625,
      "learning_rate": 1.0482745016665526e-05,
      "loss": 0.0343,
      "reward": 1.3959271758794785,
      "reward_std": 0.8586903437972069,
      "rewards/cosine_scaled_reward": 0.21879691816866398,
      "rewards/format_reward": 0.9583333432674408,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2271.4167098999023,
      "epoch": 0.27485714285714286,
      "grad_norm": 0.26521894335746765,
      "kl": 0.2750244140625,
      "learning_rate": 1.0437936906629336e-05,
      "loss": 0.0158,
      "reward": 0.7311921007931232,
      "reward_std": 0.8896900117397308,
      "rewards/cosine_scaled_reward": -0.05107062542811036,
      "rewards/format_reward": 0.8333333432674408,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2437.5000610351562,
      "epoch": 0.2754285714285714,
      "grad_norm": 0.19190503656864166,
      "kl": 0.275604248046875,
      "learning_rate": 1.0395300688680626e-05,
      "loss": 0.045,
      "reward": 0.9152816236019135,
      "reward_std": 0.8774067535996437,
      "rewards/cosine_scaled_reward": 0.061807457357645035,
      "rewards/format_reward": 0.7916666865348816,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1939.6250610351562,
      "epoch": 0.276,
      "grad_norm": 0.29819318652153015,
      "kl": 0.2987060546875,
      "learning_rate": 1.0354838440848503e-05,
      "loss": -0.0064,
      "reward": 0.9426087737083435,
      "reward_std": 0.949207216501236,
      "rewards/cosine_scaled_reward": 0.07547104358673096,
      "rewards/format_reward": 0.7916666865348816,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2167.9583892822266,
      "epoch": 0.2765714285714286,
      "grad_norm": 0.37711989879608154,
      "kl": 0.29034423828125,
      "learning_rate": 1.0316552135205838e-05,
      "loss": 0.0293,
      "reward": 0.6746074706315994,
      "reward_std": 0.6280665933154523,
      "rewards/cosine_scaled_reward": -0.07936295960098505,
      "rewards/format_reward": 0.8333333432674408,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2680.0000610351562,
      "epoch": 0.27714285714285714,
      "grad_norm": 0.35504573583602905,
      "kl": 0.446533203125,
      "learning_rate": 1.0280443637773165e-05,
      "loss": 0.0954,
      "reward": 0.5523308105766773,
      "reward_std": 0.5918973311781883,
      "rewards/cosine_scaled_reward": -0.05716794729232788,
      "rewards/format_reward": 0.6666666865348816,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2262.5416870117188,
      "epoch": 0.2777142857142857,
      "grad_norm": 0.18296080827713013,
      "kl": 0.1949462890625,
      "learning_rate": 1.0246514708427702e-05,
      "loss": 0.0443,
      "reward": 0.9830817077308893,
      "reward_std": 0.859332574531436,
      "rewards/cosine_scaled_reward": 0.05404083710163832,
      "rewards/format_reward": 0.8750000149011612,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1231.5833587646484,
      "epoch": 0.2782857142857143,
      "grad_norm": 0.18730592727661133,
      "kl": 0.1104736328125,
      "learning_rate": 1.0214767000817597e-05,
      "loss": 0.0454,
      "reward": 1.0873636417090893,
      "reward_std": 0.7174102757126093,
      "rewards/cosine_scaled_reward": 0.06451515853404999,
      "rewards/format_reward": 0.9583333432674408,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3112.8750610351562,
      "epoch": 0.27885714285714286,
      "grad_norm": 0.17851535975933075,
      "kl": 0.427734375,
      "learning_rate": 1.0185202062281336e-05,
      "loss": 0.0646,
      "reward": 0.607562929391861,
      "reward_std": 0.9592312499880791,
      "rewards/cosine_scaled_reward": -0.008718553930521011,
      "rewards/format_reward": 0.6250000260770321,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2498.3333435058594,
      "epoch": 0.2794285714285714,
      "grad_norm": 0.3319559395313263,
      "kl": 0.241943359375,
      "learning_rate": 1.0157821333772305e-05,
      "loss": -0.0705,
      "reward": 0.541482325643301,
      "reward_std": 0.6799080520868301,
      "rewards/cosine_scaled_reward": -0.16675885394215584,
      "rewards/format_reward": 0.8750000149011612,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1838.8750915527344,
      "epoch": 0.28,
      "grad_norm": 0.2813575565814972,
      "kl": 0.18414306640625,
      "learning_rate": 1.0132626149788591e-05,
      "loss": -0.0205,
      "reward": 0.9313252754509449,
      "reward_std": 0.4187740869820118,
      "rewards/cosine_scaled_reward": -0.013504065573215485,
      "rewards/format_reward": 0.9583333432674408,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1959.1250762939453,
      "epoch": 0.2805714285714286,
      "grad_norm": 1.0054746866226196,
      "kl": 0.123046875,
      "learning_rate": 1.0109617738307912e-05,
      "loss": 0.3442,
      "reward": 1.3521376699209213,
      "reward_std": 0.4419573098421097,
      "rewards/cosine_scaled_reward": 0.17606881260871887,
      "rewards/format_reward": 1.0,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2286.2084350585938,
      "epoch": 0.28114285714285714,
      "grad_norm": 0.35966256260871887,
      "kl": 0.2607421875,
      "learning_rate": 1.008879722072778e-05,
      "loss": 0.0295,
      "reward": 0.5943646021187305,
      "reward_std": 0.751454122364521,
      "rewards/cosine_scaled_reward": -0.07781771570444107,
      "rewards/format_reward": 0.7500000111758709,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2124.9583587646484,
      "epoch": 0.2817142857142857,
      "grad_norm": 0.27436164021492004,
      "kl": 0.241302490234375,
      "learning_rate": 1.0070165611810856e-05,
      "loss": -0.019,
      "reward": 1.310558546334505,
      "reward_std": 1.1386958360671997,
      "rewards/cosine_scaled_reward": 0.2386126071214676,
      "rewards/format_reward": 0.8333333432674408,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1135.4166870117188,
      "epoch": 0.2822857142857143,
      "grad_norm": 0.217626690864563,
      "kl": 0.07623291015625,
      "learning_rate": 1.0053723819635471e-05,
      "loss": 0.0985,
      "reward": 1.053057461977005,
      "reward_std": 0.5034293830394745,
      "rewards/cosine_scaled_reward": 0.0473620742559433,
      "rewards/format_reward": 0.9583333432674408,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2913.5000610351562,
      "epoch": 0.28285714285714286,
      "grad_norm": 0.16135992109775543,
      "kl": 0.2882080078125,
      "learning_rate": 1.0039472645551373e-05,
      "loss": 0.0153,
      "reward": 0.7073401757515967,
      "reward_std": 0.8274732977151871,
      "rewards/cosine_scaled_reward": -0.06299658864736557,
      "rewards/format_reward": 0.833333358168602,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1558.0834045410156,
      "epoch": 0.2834285714285714,
      "grad_norm": 0.17040328681468964,
      "kl": 0.181396484375,
      "learning_rate": 1.0027412784140691e-05,
      "loss": 0.0125,
      "reward": 1.1920581981539726,
      "reward_std": 0.9239856712520123,
      "rewards/cosine_scaled_reward": 0.13769576186314225,
      "rewards/format_reward": 0.9166666716337204,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2690.166748046875,
      "epoch": 0.284,
      "grad_norm": 0.2827388346195221,
      "kl": 0.3397216796875,
      "learning_rate": 1.0017544823184056e-05,
      "loss": 0.0234,
      "reward": 0.4144871234893799,
      "reward_std": 0.8535003513097763,
      "rewards/cosine_scaled_reward": -0.16775644943118095,
      "rewards/format_reward": 0.7500000149011612,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2351.541717529297,
      "epoch": 0.2845714285714286,
      "grad_norm": 0.233955517411232,
      "kl": 0.33514404296875,
      "learning_rate": 1.0009869243631953e-05,
      "loss": 0.0613,
      "reward": 1.0902096033096313,
      "reward_std": 1.2109316736459732,
      "rewards/cosine_scaled_reward": 0.17010477557778358,
      "rewards/format_reward": 0.7500000149011612,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2419.2084350585938,
      "epoch": 0.28514285714285714,
      "grad_norm": 0.31629398465156555,
      "kl": 0.232330322265625,
      "learning_rate": 1.000438641958131e-05,
      "loss": 0.1232,
      "reward": 1.0896388813853264,
      "reward_std": 1.0943061411380768,
      "rewards/cosine_scaled_reward": 0.1073194369673729,
      "rewards/format_reward": 0.8750000149011612,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1832.9167022705078,
      "epoch": 0.2857142857142857,
      "grad_norm": 0.27818962931632996,
      "kl": 0.27166748046875,
      "learning_rate": 1.0001096618257236e-05,
      "loss": 0.0069,
      "reward": 0.625803031027317,
      "reward_std": 0.7328432351350784,
      "rewards/cosine_scaled_reward": -0.08293185429647565,
      "rewards/format_reward": 0.7916666679084301,
      "step": 500
    },
    {
      "epoch": 0.2857142857142857,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.051498750954982825,
      "train_runtime": 24099.439,
      "train_samples_per_second": 0.498,
      "train_steps_per_second": 0.021
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}