{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 2822.7857666015625,
      "epoch": 0.004,
      "grad_norm": 0.12564538419246674,
      "kl": 0.0,
      "learning_rate": 2e-08,
      "loss": 0.0645,
      "reward": 0.09580668434500694,
      "reward_std": 0.5702872574329376,
      "rewards/cosine_scaled_reward": -0.14554904401302338,
      "rewards/format_reward": 0.3869047649204731,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2575.571533203125,
      "epoch": 0.008,
      "grad_norm": 0.15411853790283203,
      "kl": 0.0,
      "learning_rate": 4e-08,
      "loss": 0.0717,
      "reward": 0.5743008255958557,
      "reward_std": 0.7826777100563049,
      "rewards/cosine_scaled_reward": 0.03119804011657834,
      "rewards/format_reward": 0.5119047686457634,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2762.1190490722656,
      "epoch": 0.012,
      "grad_norm": 0.13477382063865662,
      "kl": 3.463029861450195e-05,
      "learning_rate": 6e-08,
      "loss": 0.0865,
      "reward": 0.21700193732976913,
      "reward_std": 0.6844624578952789,
      "rewards/cosine_scaled_reward": -0.10578475520014763,
      "rewards/format_reward": 0.4285714402794838,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2686.3214721679688,
      "epoch": 0.016,
      "grad_norm": 0.1282820850610733,
      "kl": 2.434849739074707e-05,
      "learning_rate": 8e-08,
      "loss": 0.0525,
      "reward": 0.4696298725903034,
      "reward_std": 0.7235232815146446,
      "rewards/cosine_scaled_reward": -0.0062565067782998085,
      "rewards/format_reward": 0.4821428582072258,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2917.5535888671875,
      "epoch": 0.02,
      "grad_norm": 0.14993517100811005,
      "kl": 3.725290298461914e-05,
      "learning_rate": 1e-07,
      "loss": 0.0762,
      "reward": 0.15318153076805174,
      "reward_std": 0.7213103845715523,
      "rewards/cosine_scaled_reward": -0.08412353717721999,
      "rewards/format_reward": 0.3214285857975483,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2816.2559814453125,
      "epoch": 0.024,
      "grad_norm": 0.14960958063602448,
      "kl": 3.1054019927978516e-05,
      "learning_rate": 1.2e-07,
      "loss": 0.0537,
      "reward": 0.2950221598148346,
      "reward_std": 0.738863505423069,
      "rewards/cosine_scaled_reward": -0.057846077223075554,
      "rewards/format_reward": 0.410714291036129,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2870.3988647460938,
      "epoch": 0.028,
      "grad_norm": 0.10985030233860016,
      "kl": 2.8133392333984375e-05,
      "learning_rate": 1.4e-07,
      "loss": 0.0068,
      "reward": 0.27893248095642775,
      "reward_std": 0.7550084367394447,
      "rewards/cosine_scaled_reward": -0.05993851972743869,
      "rewards/format_reward": 0.3988095410168171,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3160.452392578125,
      "epoch": 0.032,
      "grad_norm": 0.10308283567428589,
      "kl": 3.8176774978637695e-05,
      "learning_rate": 1.6e-07,
      "loss": 0.0223,
      "reward": 0.07877065148204565,
      "reward_std": 0.6431715190410614,
      "rewards/cosine_scaled_reward": -0.08263849129434675,
      "rewards/format_reward": 0.2440476268529892,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3020.607177734375,
      "epoch": 0.036,
      "grad_norm": 0.15057384967803955,
      "kl": 3.37064266204834e-05,
      "learning_rate": 1.8e-07,
      "loss": 0.0733,
      "reward": 0.06793000735342503,
      "reward_std": 0.6978132948279381,
      "rewards/cosine_scaled_reward": -0.12079690210521221,
      "rewards/format_reward": 0.3095238171517849,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3089.84521484375,
      "epoch": 0.04,
      "grad_norm": 0.11256518214941025,
      "kl": 3.2395124435424805e-05,
      "learning_rate": 2e-07,
      "loss": 0.0413,
      "reward": 0.032662300392985344,
      "reward_std": 0.6881319805979729,
      "rewards/cosine_scaled_reward": -0.13545456249266863,
      "rewards/format_reward": 0.3035714365541935,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2851.946533203125,
      "epoch": 0.044,
      "grad_norm": 0.17106953263282776,
      "kl": 3.784894943237305e-05,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 0.0636,
      "reward": 0.3718952457420528,
      "reward_std": 0.6902545392513275,
      "rewards/cosine_scaled_reward": -0.03131428617052734,
      "rewards/format_reward": 0.4345238134264946,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2798.5178833007812,
      "epoch": 0.048,
      "grad_norm": 0.1335103064775467,
      "kl": 2.9146671295166016e-05,
      "learning_rate": 2.4e-07,
      "loss": 0.0543,
      "reward": 0.40071453526616096,
      "reward_std": 0.7024472132325172,
      "rewards/cosine_scaled_reward": -0.02285701408982277,
      "rewards/format_reward": 0.4464285746216774,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2948.9464721679688,
      "epoch": 0.052,
      "grad_norm": 0.1271769255399704,
      "kl": 3.698468208312988e-05,
      "learning_rate": 2.6e-07,
      "loss": 0.0693,
      "reward": 0.47545497864484787,
      "reward_std": 0.7740402817726135,
      "rewards/cosine_scaled_reward": 0.002608438953757286,
      "rewards/format_reward": 0.470238097012043,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2679.3928833007812,
      "epoch": 0.056,
      "grad_norm": 0.12242422997951508,
      "kl": 2.8014183044433594e-05,
      "learning_rate": 2.8e-07,
      "loss": 0.0489,
      "reward": 0.40165250562131405,
      "reward_std": 0.7790777683258057,
      "rewards/cosine_scaled_reward": -0.028340420685708523,
      "rewards/format_reward": 0.4583333507180214,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2889.136962890625,
      "epoch": 0.06,
      "grad_norm": 0.19158992171287537,
      "kl": 3.224611282348633e-05,
      "learning_rate": 3e-07,
      "loss": 0.0704,
      "reward": 0.15117042418569326,
      "reward_std": 0.6893174201250076,
      "rewards/cosine_scaled_reward": -0.10893859504722059,
      "rewards/format_reward": 0.3690476231276989,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2892.6488647460938,
      "epoch": 0.064,
      "grad_norm": 0.15633279085159302,
      "kl": 3.668665885925293e-05,
      "learning_rate": 3.2e-07,
      "loss": 0.0733,
      "reward": -0.09426919370889664,
      "reward_std": 0.5802397355437279,
      "rewards/cosine_scaled_reward": -0.18403935432434082,
      "rewards/format_reward": 0.2738095298409462,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2920.3511962890625,
      "epoch": 0.068,
      "grad_norm": 0.12191536277532578,
      "kl": 3.221631050109863e-05,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.0214,
      "reward": 0.13339833123609424,
      "reward_std": 0.6428257077932358,
      "rewards/cosine_scaled_reward": -0.11187227349728346,
      "rewards/format_reward": 0.3571428619325161,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2704.672607421875,
      "epoch": 0.072,
      "grad_norm": 0.2207571119070053,
      "kl": 2.4259090423583984e-05,
      "learning_rate": 3.6e-07,
      "loss": 0.0858,
      "reward": 0.4250662699341774,
      "reward_std": 0.7673918604850769,
      "rewards/cosine_scaled_reward": -0.019609727547504008,
      "rewards/format_reward": 0.4642857350409031,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2800.6429443359375,
      "epoch": 0.076,
      "grad_norm": 0.12734168767929077,
      "kl": 2.4378299713134766e-05,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 0.0471,
      "reward": 0.4042445756494999,
      "reward_std": 0.6929292231798172,
      "rewards/cosine_scaled_reward": -0.02704438249929808,
      "rewards/format_reward": 0.4583333432674408,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2697.8274536132812,
      "epoch": 0.08,
      "grad_norm": 0.15472018718719482,
      "kl": 2.35140323638916e-05,
      "learning_rate": 4e-07,
      "loss": 0.0265,
      "reward": 0.3560524769127369,
      "reward_std": 0.6769110411405563,
      "rewards/cosine_scaled_reward": -0.05114044318906963,
      "rewards/format_reward": 0.4583333432674408,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2339.4405517578125,
      "epoch": 0.084,
      "grad_norm": 0.21466447412967682,
      "kl": 2.086162567138672e-05,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 0.0806,
      "reward": 0.7416469305753708,
      "reward_std": 0.841043546795845,
      "rewards/cosine_scaled_reward": 0.06725202780216932,
      "rewards/format_reward": 0.6071428656578064,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2781.5238037109375,
      "epoch": 0.088,
      "grad_norm": 0.19139103591442108,
      "kl": 3.0338764190673828e-05,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 0.0773,
      "reward": 0.20257593411952257,
      "reward_std": 0.7891978472471237,
      "rewards/cosine_scaled_reward": -0.1129977386444807,
      "rewards/format_reward": 0.4285714365541935,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3036.761962890625,
      "epoch": 0.092,
      "grad_norm": 0.1108132153749466,
      "kl": 2.6345252990722656e-05,
      "learning_rate": 4.6e-07,
      "loss": 0.0238,
      "reward": 0.20629926398396492,
      "reward_std": 0.7457813173532486,
      "rewards/cosine_scaled_reward": -0.07542179408483207,
      "rewards/format_reward": 0.3571428619325161,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3143.1131591796875,
      "epoch": 0.096,
      "grad_norm": 0.10591176152229309,
      "kl": 2.703070640563965e-05,
      "learning_rate": 4.8e-07,
      "loss": 0.0637,
      "reward": 0.0749267227947712,
      "reward_std": 0.6808565855026245,
      "rewards/cosine_scaled_reward": -0.1292033027857542,
      "rewards/format_reward": 0.3333333395421505,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2934.4227294921875,
      "epoch": 0.1,
      "grad_norm": 0.12180113047361374,
      "kl": 1.4990568161010742e-05,
      "learning_rate": 5e-07,
      "loss": 0.0525,
      "reward": 0.3605663161724806,
      "reward_std": 0.7757866084575653,
      "rewards/cosine_scaled_reward": -0.028050171211361885,
      "rewards/format_reward": 0.4166666716337204,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3100.5357666015625,
      "epoch": 0.104,
      "grad_norm": 0.14736856520175934,
      "kl": 2.230703830718994e-05,
      "learning_rate": 5.2e-07,
      "loss": 0.087,
      "reward": 0.1489051878452301,
      "reward_std": 0.7608643025159836,
      "rewards/cosine_scaled_reward": -0.08923787740059197,
      "rewards/format_reward": 0.32738095708191395,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2978.452392578125,
      "epoch": 0.108,
      "grad_norm": 0.1490376740694046,
      "kl": 2.7447938919067383e-05,
      "learning_rate": 5.4e-07,
      "loss": 0.0633,
      "reward": 0.16602796246297657,
      "reward_std": 0.762113556265831,
      "rewards/cosine_scaled_reward": -0.09555743727833033,
      "rewards/format_reward": 0.3571428656578064,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2923.1012573242188,
      "epoch": 0.112,
      "grad_norm": 0.11410558968782425,
      "kl": 3.108382225036621e-05,
      "learning_rate": 5.6e-07,
      "loss": 0.0618,
      "reward": 0.058234728407114744,
      "reward_std": 0.5919530540704727,
      "rewards/cosine_scaled_reward": -0.14052549470216036,
      "rewards/format_reward": 0.3392857201397419,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2925.1011962890625,
      "epoch": 0.116,
      "grad_norm": 0.17734545469284058,
      "kl": 5.251169204711914e-05,
      "learning_rate": 5.8e-07,
      "loss": 0.0463,
      "reward": 0.24072746047750115,
      "reward_std": 0.7061209976673126,
      "rewards/cosine_scaled_reward": -0.061183891259133816,
      "rewards/format_reward": 0.3630952425301075,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2882.39892578125,
      "epoch": 0.12,
      "grad_norm": 0.15557299554347992,
      "kl": 2.1502375602722168e-05,
      "learning_rate": 6e-07,
      "loss": 0.0703,
      "reward": 0.22035705484449863,
      "reward_std": 0.5751676708459854,
      "rewards/cosine_scaled_reward": -0.07136909663677216,
      "rewards/format_reward": 0.3630952462553978,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2717.607177734375,
      "epoch": 0.124,
      "grad_norm": 0.16903533041477203,
      "kl": 6.181001663208008e-05,
      "learning_rate": 6.2e-07,
      "loss": 0.07,
      "reward": 0.3481953740119934,
      "reward_std": 0.7361179888248444,
      "rewards/cosine_scaled_reward": -0.025307081639766693,
      "rewards/format_reward": 0.3988095298409462,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2585.244140625,
      "epoch": 0.128,
      "grad_norm": 0.1481872797012329,
      "kl": 0.00023996829986572266,
      "learning_rate": 6.4e-07,
      "loss": 0.0664,
      "reward": 0.5805501043796539,
      "reward_std": 0.805858314037323,
      "rewards/cosine_scaled_reward": 0.019441714510321617,
      "rewards/format_reward": 0.5416666865348816,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2608.7083740234375,
      "epoch": 0.132,
      "grad_norm": 0.10800693184137344,
      "kl": 0.0002808570861816406,
      "learning_rate": 6.6e-07,
      "loss": 0.0255,
      "reward": 0.5432634204626083,
      "reward_std": 0.7363616675138474,
      "rewards/cosine_scaled_reward": 0.02460789866745472,
      "rewards/format_reward": 0.4940476268529892,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2769.1964721679688,
      "epoch": 0.136,
      "grad_norm": 0.12105516344308853,
      "kl": 0.00020498037338256836,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0184,
      "reward": 0.18091929703950882,
      "reward_std": 0.6703035831451416,
      "rewards/cosine_scaled_reward": -0.10596893168985844,
      "rewards/format_reward": 0.3928571604192257,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3090.1607666015625,
      "epoch": 0.14,
      "grad_norm": 0.11914981156587601,
      "kl": 0.0002568960189819336,
      "learning_rate": 7e-07,
      "loss": 0.0617,
      "reward": 0.08196480484912172,
      "reward_std": 0.7742973417043686,
      "rewards/cosine_scaled_reward": -0.10782713070511818,
      "rewards/format_reward": 0.2976190559566021,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2790.0596313476562,
      "epoch": 0.144,
      "grad_norm": 0.10883598774671555,
      "kl": 0.00027942657470703125,
      "learning_rate": 7.2e-07,
      "loss": 0.0163,
      "reward": 0.31424143677577376,
      "reward_std": 0.669949933886528,
      "rewards/cosine_scaled_reward": -0.06311738677322865,
      "rewards/format_reward": 0.4404762014746666,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2916.3452758789062,
      "epoch": 0.148,
      "grad_norm": 0.1492447555065155,
      "kl": 0.00025272369384765625,
      "learning_rate": 7.4e-07,
      "loss": 0.0592,
      "reward": 0.1357547640800476,
      "reward_std": 0.7365808188915253,
      "rewards/cosine_scaled_reward": -0.10771786456461996,
      "rewards/format_reward": 0.3511904813349247,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3342.6786499023438,
      "epoch": 0.152,
      "grad_norm": 0.08414288610219955,
      "kl": 0.00013870000839233398,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.0239,
      "reward": -0.1723631415516138,
      "reward_std": 0.6109825298190117,
      "rewards/cosine_scaled_reward": -0.16951490193605423,
      "rewards/format_reward": 0.16666666977107525,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2762.5774536132812,
      "epoch": 0.156,
      "grad_norm": 0.14042888581752777,
      "kl": 0.0005602836608886719,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.0588,
      "reward": 0.3224933594465256,
      "reward_std": 0.6976565718650818,
      "rewards/cosine_scaled_reward": -0.04708665423095226,
      "rewards/format_reward": 0.416666679084301,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2729.9644165039062,
      "epoch": 0.16,
      "grad_norm": 0.110390305519104,
      "kl": 0.00016605854034423828,
      "learning_rate": 8e-07,
      "loss": 0.0549,
      "reward": 0.4423699714243412,
      "reward_std": 0.6286562532186508,
      "rewards/cosine_scaled_reward": -0.016910257749259472,
      "rewards/format_reward": 0.476190485060215,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2955.7084350585938,
      "epoch": 0.164,
      "grad_norm": 0.15253609418869019,
      "kl": 0.00040471553802490234,
      "learning_rate": 8.199999999999999e-07,
      "loss": 0.0718,
      "reward": 0.44552009692415595,
      "reward_std": 0.7759689763188362,
      "rewards/cosine_scaled_reward": 0.0144266925053671,
      "rewards/format_reward": 0.4166666753590107,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2961.0596313476562,
      "epoch": 0.168,
      "grad_norm": 0.22110103070735931,
      "kl": 0.0009613037109375,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.1186,
      "reward": 0.2217194978147745,
      "reward_std": 0.6207270994782448,
      "rewards/cosine_scaled_reward": -0.07366406172513962,
      "rewards/format_reward": 0.3690476268529892,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3015.7738647460938,
      "epoch": 0.172,
      "grad_norm": 0.23720885813236237,
      "kl": 0.0005915164947509766,
      "learning_rate": 8.599999999999999e-07,
      "loss": 0.1245,
      "reward": -0.04521503113210201,
      "reward_std": 0.62105892598629,
      "rewards/cosine_scaled_reward": -0.16844085440970957,
      "rewards/format_reward": 0.2916666716337204,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2937.2381591796875,
      "epoch": 0.176,
      "grad_norm": 0.09096106886863708,
      "kl": 0.00051116943359375,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.0249,
      "reward": 0.22813843563199043,
      "reward_std": 0.6727291792631149,
      "rewards/cosine_scaled_reward": -0.058549837151076645,
      "rewards/format_reward": 0.3452381007373333,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3149.511962890625,
      "epoch": 0.18,
      "grad_norm": 0.11164555698633194,
      "kl": 0.0004715919494628906,
      "learning_rate": 9e-07,
      "loss": 0.017,
      "reward": 0.05970348231494427,
      "reward_std": 0.7763290405273438,
      "rewards/cosine_scaled_reward": -0.11300540715456009,
      "rewards/format_reward": 0.285714291036129,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3184.6845703125,
      "epoch": 0.184,
      "grad_norm": 0.1711866706609726,
      "kl": 0.0007777214050292969,
      "learning_rate": 9.2e-07,
      "loss": 0.0731,
      "reward": 0.1386737246066332,
      "reward_std": 0.7276585251092911,
      "rewards/cosine_scaled_reward": -0.06459171324968338,
      "rewards/format_reward": 0.26785714738070965,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3015.386962890625,
      "epoch": 0.188,
      "grad_norm": 0.12470238655805588,
      "kl": 0.0014100074768066406,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.0653,
      "reward": 0.16049158992245793,
      "reward_std": 0.7017006278038025,
      "rewards/cosine_scaled_reward": -0.08642087457701564,
      "rewards/format_reward": 0.3333333432674408,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2909.96435546875,
      "epoch": 0.192,
      "grad_norm": 0.28355535864830017,
      "kl": 0.009288787841796875,
      "learning_rate": 9.6e-07,
      "loss": 0.0581,
      "reward": 0.0768200121819973,
      "reward_std": 0.7135801166296005,
      "rewards/cosine_scaled_reward": -0.14016142301261425,
      "rewards/format_reward": 0.3571428693830967,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2753.6845703125,
      "epoch": 0.196,
      "grad_norm": 0.6567728519439697,
      "kl": 0.023477554321289062,
      "learning_rate": 9.8e-07,
      "loss": 0.0866,
      "reward": 0.4337980281561613,
      "reward_std": 0.8068065047264099,
      "rewards/cosine_scaled_reward": -0.006315283477306366,
      "rewards/format_reward": 0.4464285746216774,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2934.8274536132812,
      "epoch": 0.2,
      "grad_norm": 0.11382321268320084,
      "kl": 0.0025758743286132812,
      "learning_rate": 1e-06,
      "loss": 0.0714,
      "reward": 0.2880665063858032,
      "reward_std": 0.6403830945491791,
      "rewards/cosine_scaled_reward": -0.049419129034504294,
      "rewards/format_reward": 0.3869047649204731,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2840.7678833007812,
      "epoch": 0.204,
      "grad_norm": 0.11641126126050949,
      "kl": 0.0050792694091796875,
      "learning_rate": 9.999890338174275e-07,
      "loss": 0.023,
      "reward": 0.2840890493243933,
      "reward_std": 0.692708894610405,
      "rewards/cosine_scaled_reward": -0.06628882512450218,
      "rewards/format_reward": 0.4166666716337204,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3119.9881591796875,
      "epoch": 0.208,
      "grad_norm": 0.14652805030345917,
      "kl": 0.0032052993774414062,
      "learning_rate": 9.999561358041868e-07,
      "loss": 0.0641,
      "reward": 0.18620363296940923,
      "reward_std": 0.8490904271602631,
      "rewards/cosine_scaled_reward": -0.058683907613158226,
      "rewards/format_reward": 0.3035714365541935,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2926.9940795898438,
      "epoch": 0.212,
      "grad_norm": 0.19241634011268616,
      "kl": 0.00447845458984375,
      "learning_rate": 9.999013075636804e-07,
      "loss": 0.0747,
      "reward": 0.36803684243932366,
      "reward_std": 0.823193870484829,
      "rewards/cosine_scaled_reward": -0.006457769020926207,
      "rewards/format_reward": 0.380952388048172,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3031.7203369140625,
      "epoch": 0.216,
      "grad_norm": 0.12843742966651917,
      "kl": 0.0029668807983398438,
      "learning_rate": 9.998245517681593e-07,
      "loss": 0.0153,
      "reward": 0.2685772944241762,
      "reward_std": 0.6489126533269882,
      "rewards/cosine_scaled_reward": -0.04428278561681509,
      "rewards/format_reward": 0.3571428693830967,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3123.136962890625,
      "epoch": 0.22,
      "grad_norm": 0.1504901498556137,
      "kl": 0.006084442138671875,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0691,
      "reward": 0.03501664288341999,
      "reward_std": 0.6481388062238693,
      "rewards/cosine_scaled_reward": -0.11642025248147547,
      "rewards/format_reward": 0.2678571529686451,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2673.434539794922,
      "epoch": 0.224,
      "grad_norm": 0.1224113255739212,
      "kl": 0.0033931732177734375,
      "learning_rate": 9.996052735444862e-07,
      "loss": 0.0269,
      "reward": 0.6296312126796693,
      "reward_std": 0.6751764714717865,
      "rewards/cosine_scaled_reward": 0.07374419644474983,
      "rewards/format_reward": 0.482142873108387,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3170.0535888671875,
      "epoch": 0.228,
      "grad_norm": 0.1044960618019104,
      "kl": 0.0024929046630859375,
      "learning_rate": 9.994627618036452e-07,
      "loss": 0.0262,
      "reward": 0.21022793278098106,
      "reward_std": 0.7194458544254303,
      "rewards/cosine_scaled_reward": -0.05560031719505787,
      "rewards/format_reward": 0.3214285857975483,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2590.0179138183594,
      "epoch": 0.232,
      "grad_norm": 0.13768674433231354,
      "kl": 0.0045948028564453125,
      "learning_rate": 9.992983438818915e-07,
      "loss": 0.0592,
      "reward": 0.4493846707046032,
      "reward_std": 0.7118680775165558,
      "rewards/cosine_scaled_reward": -0.02233148762024939,
      "rewards/format_reward": 0.4940476231276989,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3087.4702758789062,
      "epoch": 0.236,
      "grad_norm": 0.13870052993297577,
      "kl": 0.001789093017578125,
      "learning_rate": 9.991120277927223e-07,
      "loss": 0.0691,
      "reward": 0.3166997814550996,
      "reward_std": 0.7532177269458771,
      "rewards/cosine_scaled_reward": -0.011292967945337296,
      "rewards/format_reward": 0.3392857201397419,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2717.3036499023438,
      "epoch": 0.24,
      "grad_norm": 0.14514827728271484,
      "kl": 0.0060253143310546875,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0622,
      "reward": 0.16810212982818484,
      "reward_std": 0.5123014599084854,
      "rewards/cosine_scaled_reward": -0.10642512841150165,
      "rewards/format_reward": 0.3809523843228817,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3105.7977294921875,
      "epoch": 0.244,
      "grad_norm": 0.10494968295097351,
      "kl": 0.0026693344116210938,
      "learning_rate": 9.98673738502114e-07,
      "loss": 0.0639,
      "reward": 0.08244643732905388,
      "reward_std": 0.7039294093847275,
      "rewards/cosine_scaled_reward": -0.10461012227460742,
      "rewards/format_reward": 0.29166666977107525,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3213.3214721679688,
      "epoch": 0.248,
      "grad_norm": 0.0999075248837471,
      "kl": 0.0023097991943359375,
      "learning_rate": 9.98421786662277e-07,
      "loss": 0.053,
      "reward": 0.09679291397333145,
      "reward_std": 0.7138089835643768,
      "rewards/cosine_scaled_reward": -0.0914845080114901,
      "rewards/format_reward": 0.2797619104385376,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2783.9584350585938,
      "epoch": 0.252,
      "grad_norm": 0.19832438230514526,
      "kl": 0.0027294158935546875,
      "learning_rate": 9.981479793771866e-07,
      "loss": 0.0773,
      "reward": 0.2238014191389084,
      "reward_std": 0.6036202609539032,
      "rewards/cosine_scaled_reward": -0.06369452457875013,
      "rewards/format_reward": 0.351190485060215,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2976.6607666015625,
      "epoch": 0.256,
      "grad_norm": 0.12335456907749176,
      "kl": 0.0020885467529296875,
      "learning_rate": 9.97852329991824e-07,
      "loss": 0.0857,
      "reward": 0.27347568422555923,
      "reward_std": 0.7463532835245132,
      "rewards/cosine_scaled_reward": -0.03885740428813733,
      "rewards/format_reward": 0.3511904887855053,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2825.6130981445312,
      "epoch": 0.26,
      "grad_norm": 0.13863790035247803,
      "kl": 0.002285003662109375,
      "learning_rate": 9.975348529157229e-07,
      "loss": 0.0349,
      "reward": 0.3712610546499491,
      "reward_std": 0.7277249395847321,
      "rewards/cosine_scaled_reward": -0.037583764642477036,
      "rewards/format_reward": 0.4464285746216774,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2974.232177734375,
      "epoch": 0.264,
      "grad_norm": 0.11186616122722626,
      "kl": 0.002407073974609375,
      "learning_rate": 9.971955636222684e-07,
      "loss": 0.0025,
      "reward": 0.07817286718636751,
      "reward_std": 0.640307292342186,
      "rewards/cosine_scaled_reward": -0.12460404448211193,
      "rewards/format_reward": 0.3273809589445591,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3152.702392578125,
      "epoch": 0.268,
      "grad_norm": 0.12694397568702698,
      "kl": 0.00250244140625,
      "learning_rate": 9.968344786479415e-07,
      "loss": 0.07,
      "reward": 0.1549822874367237,
      "reward_std": 0.6663320288062096,
      "rewards/cosine_scaled_reward": -0.08322314161341637,
      "rewards/format_reward": 0.3214285783469677,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2938.0179443359375,
      "epoch": 0.272,
      "grad_norm": 0.12655070424079895,
      "kl": 0.00341033935546875,
      "learning_rate": 9.964516155915151e-07,
      "loss": 0.0545,
      "reward": 0.2076467089354992,
      "reward_std": 0.7705407291650772,
      "rewards/cosine_scaled_reward": -0.08070044964551926,
      "rewards/format_reward": 0.3690476268529892,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3025.8095703125,
      "epoch": 0.276,
      "grad_norm": 0.12574610114097595,
      "kl": 0.003986358642578125,
      "learning_rate": 9.960469931131936e-07,
      "loss": 0.0227,
      "reward": 0.03160261735320091,
      "reward_std": 0.621779277920723,
      "rewards/cosine_scaled_reward": -0.13003203552216291,
      "rewards/format_reward": 0.2916666716337204,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2877.5536499023438,
      "epoch": 0.28,
      "grad_norm": 0.12181198596954346,
      "kl": 0.0038127899169921875,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.0248,
      "reward": 0.2757916431874037,
      "reward_std": 0.7794490903615952,
      "rewards/cosine_scaled_reward": -0.06448512757197022,
      "rewards/format_reward": 0.4047619178891182,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2776.6131591796875,
      "epoch": 0.284,
      "grad_norm": 0.17934156954288483,
      "kl": 0.0039825439453125,
      "learning_rate": 9.951725498333448e-07,
      "loss": 0.1072,
      "reward": 0.2793612889945507,
      "reward_std": 0.7607921361923218,
      "rewards/cosine_scaled_reward": -0.06865269318223,
      "rewards/format_reward": 0.4166666716337204,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3037.5178833007812,
      "epoch": 0.288,
      "grad_norm": 0.1311851143836975,
      "kl": 0.0038604736328125,
      "learning_rate": 9.947027716509488e-07,
      "loss": 0.0745,
      "reward": 0.34610075503587723,
      "reward_std": 0.8604296147823334,
      "rewards/cosine_scaled_reward": -0.01444962713867426,
      "rewards/format_reward": 0.3750000074505806,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2880.3036499023438,
      "epoch": 0.292,
      "grad_norm": 0.10903972387313843,
      "kl": 0.005123138427734375,
      "learning_rate": 9.942113192828444e-07,
      "loss": 0.0247,
      "reward": 0.41264417115598917,
      "reward_std": 0.6988394409418106,
      "rewards/cosine_scaled_reward": -0.00201124744489789,
      "rewards/format_reward": 0.4166666753590107,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2689.5774536132812,
      "epoch": 0.296,
      "grad_norm": 0.1187940314412117,
      "kl": 0.00371551513671875,
      "learning_rate": 9.93698216681727e-07,
      "loss": 0.0343,
      "reward": 0.49459290131926537,
      "reward_std": 0.6582471132278442,
      "rewards/cosine_scaled_reward": -0.005679763096850365,
      "rewards/format_reward": 0.505952388048172,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3011.2440795898438,
      "epoch": 0.3,
      "grad_norm": 0.15027488768100739,
      "kl": 0.00612640380859375,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.0922,
      "reward": -0.12369688227772713,
      "reward_std": 0.5938592255115509,
      "rewards/cosine_scaled_reward": -0.1957770138978958,
      "rewards/format_reward": 0.26785715110599995,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3087.9762573242188,
      "epoch": 0.304,
      "grad_norm": 0.1342087835073471,
      "kl": 0.004589080810546875,
      "learning_rate": 9.926071618660237e-07,
      "loss": 0.051,
      "reward": 0.029461721424013376,
      "reward_std": 0.5008194297552109,
      "rewards/cosine_scaled_reward": -0.12812629727704916,
      "rewards/format_reward": 0.2857142915017903,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2708.6845703125,
      "epoch": 0.308,
      "grad_norm": 0.11936229467391968,
      "kl": 0.00518035888671875,
      "learning_rate": 9.9202926282791e-07,
      "loss": 0.0693,
      "reward": 0.38730931747704744,
      "reward_std": 0.6931557953357697,
      "rewards/cosine_scaled_reward": -0.0503929746337235,
      "rewards/format_reward": 0.4880952388048172,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3123.4345703125,
      "epoch": 0.312,
      "grad_norm": 0.10926749557256699,
      "kl": 0.006168365478515625,
      "learning_rate": 9.91429819907136e-07,
      "loss": 0.0281,
      "reward": 0.14759791223332286,
      "reward_std": 0.6552696228027344,
      "rewards/cosine_scaled_reward": -0.08989150635898113,
      "rewards/format_reward": 0.3273809519596398,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3104.886962890625,
      "epoch": 0.316,
      "grad_norm": 0.17055809497833252,
      "kl": 0.005580902099609375,
      "learning_rate": 9.908088623197048e-07,
      "loss": 0.122,
      "reward": 0.1187831275165081,
      "reward_std": 0.7589289993047714,
      "rewards/cosine_scaled_reward": -0.11917985696345568,
      "rewards/format_reward": 0.3571428656578064,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2892.2261962890625,
      "epoch": 0.32,
      "grad_norm": 0.12601953744888306,
      "kl": 0.004444122314453125,
      "learning_rate": 9.901664203302124e-07,
      "loss": 0.0261,
      "reward": 0.2744547198526561,
      "reward_std": 0.686069905757904,
      "rewards/cosine_scaled_reward": -0.05920121353119612,
      "rewards/format_reward": 0.3928571492433548,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3041.1488647460938,
      "epoch": 0.324,
      "grad_norm": 0.11552488803863525,
      "kl": 0.00655364990234375,
      "learning_rate": 9.895025252503755e-07,
      "loss": 0.0229,
      "reward": 0.12106413394212723,
      "reward_std": 0.675617903470993,
      "rewards/cosine_scaled_reward": -0.1061346041969955,
      "rewards/format_reward": 0.33333333767950535,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3129.6429443359375,
      "epoch": 0.328,
      "grad_norm": 0.09706410765647888,
      "kl": 0.0055084228515625,
      "learning_rate": 9.888172094375033e-07,
      "loss": 0.0452,
      "reward": 0.12287123966962099,
      "reward_std": 0.7173575460910797,
      "rewards/cosine_scaled_reward": -0.09035009983927011,
      "rewards/format_reward": 0.3035714365541935,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2602.047637939453,
      "epoch": 0.332,
      "grad_norm": 0.11233574151992798,
      "kl": 0.01010894775390625,
      "learning_rate": 9.881105062929221e-07,
      "loss": 0.0258,
      "reward": 0.35400932375341654,
      "reward_std": 0.6462785750627518,
      "rewards/cosine_scaled_reward": -0.06109058950096369,
      "rewards/format_reward": 0.476190485060215,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2894.011962890625,
      "epoch": 0.336,
      "grad_norm": 0.12059750407934189,
      "kl": 0.00872802734375,
      "learning_rate": 9.873824502603459e-07,
      "loss": 0.0806,
      "reward": 0.2941260803490877,
      "reward_std": 0.78522889316082,
      "rewards/cosine_scaled_reward": -0.06722268275916576,
      "rewards/format_reward": 0.4285714328289032,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2979.226318359375,
      "epoch": 0.34,
      "grad_norm": 0.15206296741962433,
      "kl": 0.00783538818359375,
      "learning_rate": 9.866330768241983e-07,
      "loss": 0.0755,
      "reward": 0.21889091655611992,
      "reward_std": 0.6674999743700027,
      "rewards/cosine_scaled_reward": -0.06614979542791843,
      "rewards/format_reward": 0.3511904887855053,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2819.5892944335938,
      "epoch": 0.344,
      "grad_norm": 0.1093529462814331,
      "kl": 0.00661468505859375,
      "learning_rate": 9.85862422507884e-07,
      "loss": 0.0353,
      "reward": 0.20389786185114644,
      "reward_std": 0.6942542195320129,
      "rewards/cosine_scaled_reward": -0.07959869271144271,
      "rewards/format_reward": 0.3630952425301075,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2616.452392578125,
      "epoch": 0.348,
      "grad_norm": 0.15942011773586273,
      "kl": 0.00847625732421875,
      "learning_rate": 9.850705248720068e-07,
      "loss": 0.0773,
      "reward": 0.3760679364204407,
      "reward_std": 0.6467384025454521,
      "rewards/cosine_scaled_reward": -0.05898985452950001,
      "rewards/format_reward": 0.4940476194024086,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2923.7500610351562,
      "epoch": 0.352,
      "grad_norm": 0.10565865784883499,
      "kl": 0.0073089599609375,
      "learning_rate": 9.8425742251254e-07,
      "loss": 0.0227,
      "reward": 0.3185804970562458,
      "reward_std": 0.6098055616021156,
      "rewards/cosine_scaled_reward": -0.010352615499868989,
      "rewards/format_reward": 0.3392857201397419,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2752.6309814453125,
      "epoch": 0.356,
      "grad_norm": 0.1690302938222885,
      "kl": 0.0132293701171875,
      "learning_rate": 9.83423155058946e-07,
      "loss": 0.0312,
      "reward": 0.46914676763117313,
      "reward_std": 0.7854363918304443,
      "rewards/cosine_scaled_reward": 0.005406718701124191,
      "rewards/format_reward": 0.4583333432674408,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2769.1488647460938,
      "epoch": 0.36,
      "grad_norm": 0.17653611302375793,
      "kl": 0.011993408203125,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0815,
      "reward": 0.3604448903352022,
      "reward_std": 0.798264317214489,
      "rewards/cosine_scaled_reward": -0.04894421715289354,
      "rewards/format_reward": 0.4583333432674408,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3187.3631591796875,
      "epoch": 0.364,
      "grad_norm": 0.1068400964140892,
      "kl": 0.0078277587890625,
      "learning_rate": 9.816912885430258e-07,
      "loss": 0.0211,
      "reward": -0.03608314320445061,
      "reward_std": 0.5826699808239937,
      "rewards/cosine_scaled_reward": -0.14006539154797792,
      "rewards/format_reward": 0.2440476305782795,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2924.916748046875,
      "epoch": 0.368,
      "grad_norm": 0.17665976285934448,
      "kl": 0.0091400146484375,
      "learning_rate": 9.807937738894303e-07,
      "loss": 0.0852,
      "reward": 0.2780441716313362,
      "reward_std": 0.7524297386407852,
      "rewards/cosine_scaled_reward": -0.07526363711804152,
      "rewards/format_reward": 0.4285714291036129,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2939.3512573242188,
      "epoch": 0.372,
      "grad_norm": 0.13597099483013153,
      "kl": 0.0076446533203125,
      "learning_rate": 9.798752629550546e-07,
      "loss": 0.0324,
      "reward": 0.33797190338373184,
      "reward_std": 0.5880008786916733,
      "rewards/cosine_scaled_reward": -0.01851405529305339,
      "rewards/format_reward": 0.3750000111758709,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2922.875,
      "epoch": 0.376,
      "grad_norm": 0.12246444076299667,
      "kl": 0.00815582275390625,
      "learning_rate": 9.78935800506826e-07,
      "loss": 0.0801,
      "reward": 0.14625070057809353,
      "reward_std": 0.690229170024395,
      "rewards/cosine_scaled_reward": -0.12330322340130806,
      "rewards/format_reward": 0.3928571492433548,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3222.3632202148438,
      "epoch": 0.38,
      "grad_norm": 0.08992121368646622,
      "kl": 0.00704193115234375,
      "learning_rate": 9.779754323328192e-07,
      "loss": -0.001,
      "reward": 0.26800261437892914,
      "reward_std": 0.7103277295827866,
      "rewards/cosine_scaled_reward": -0.011832039803266525,
      "rewards/format_reward": 0.2916666716337204,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3215.2202758789062,
      "epoch": 0.384,
      "grad_norm": 0.12997964024543762,
      "kl": 0.00983428955078125,
      "learning_rate": 9.769942052400235e-07,
      "loss": 0.0591,
      "reward": -0.03136127255856991,
      "reward_std": 0.6326467096805573,
      "rewards/cosine_scaled_reward": -0.18234730698168278,
      "rewards/format_reward": 0.33333333767950535,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2857.0416870117188,
      "epoch": 0.388,
      "grad_norm": 0.16558504104614258,
      "kl": 0.007293701171875,
      "learning_rate": 9.759921670520634e-07,
      "loss": 0.0821,
      "reward": 0.19707820191979408,
      "reward_std": 0.6188783794641495,
      "rewards/cosine_scaled_reward": -0.10681804455816746,
      "rewards/format_reward": 0.410714291036129,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2983.9940795898438,
      "epoch": 0.392,
      "grad_norm": 0.1882523149251938,
      "kl": 0.00878143310546875,
      "learning_rate": 9.749693666068663e-07,
      "loss": 0.1027,
      "reward": 0.3959239423274994,
      "reward_std": 0.875861182808876,
      "rewards/cosine_scaled_reward": 0.004509590216912329,
      "rewards/format_reward": 0.3869047686457634,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3077.0952758789062,
      "epoch": 0.396,
      "grad_norm": 0.1565464437007904,
      "kl": 0.011505126953125,
      "learning_rate": 9.739258537542835e-07,
      "loss": 0.0968,
      "reward": 0.027048692107200623,
      "reward_std": 0.7064545601606369,
      "rewards/cosine_scaled_reward": -0.1323089925572276,
      "rewards/format_reward": 0.2916666679084301,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2358.2381591796875,
      "epoch": 0.4,
      "grad_norm": 0.14640696346759796,
      "kl": 0.00804901123046875,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0774,
      "reward": 0.8142919540405273,
      "reward_std": 0.7067123055458069,
      "rewards/cosine_scaled_reward": 0.10357456840574741,
      "rewards/format_reward": 0.6071428507566452,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3195.6428833007812,
      "epoch": 0.404,
      "grad_norm": 0.14821472764015198,
      "kl": 0.0126495361328125,
      "learning_rate": 9.717768952713511e-07,
      "loss": 0.0897,
      "reward": 0.2797414679080248,
      "reward_std": 0.8859200328588486,
      "rewards/cosine_scaled_reward": -0.0535816540941596,
      "rewards/format_reward": 0.3869047649204731,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3046.7798461914062,
      "epoch": 0.408,
      "grad_norm": 0.13244982063770294,
      "kl": 0.0107879638671875,
      "learning_rate": 9.706715543782064e-07,
      "loss": 0.0638,
      "reward": 0.16126136109232903,
      "reward_std": 0.6933339387178421,
      "rewards/cosine_scaled_reward": -0.08305979892611504,
      "rewards/format_reward": 0.3273809552192688,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2718.8275146484375,
      "epoch": 0.412,
      "grad_norm": 0.20267100632190704,
      "kl": 0.01003265380859375,
      "learning_rate": 9.695457105469804e-07,
      "loss": 0.1246,
      "reward": 0.4206714928150177,
      "reward_std": 0.6706456393003464,
      "rewards/cosine_scaled_reward": -0.0069261584430933,
      "rewards/format_reward": 0.43452382180839777,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2973.7559814453125,
      "epoch": 0.416,
      "grad_norm": 0.12351427227258682,
      "kl": 0.01038360595703125,
      "learning_rate": 9.683994186497132e-07,
      "loss": 0.0716,
      "reward": 0.20578511937389976,
      "reward_std": 0.7138822227716446,
      "rewards/cosine_scaled_reward": -0.0846074327128008,
      "rewards/format_reward": 0.3750000037252903,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2596.1250915527344,
      "epoch": 0.42,
      "grad_norm": 0.13755492866039276,
      "kl": 0.0099945068359375,
      "learning_rate": 9.672327345550543e-07,
      "loss": 0.0544,
      "reward": 0.6021162122488022,
      "reward_std": 0.8435305505990982,
      "rewards/cosine_scaled_reward": 0.012367631308734417,
      "rewards/format_reward": 0.5773809663951397,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2986.226318359375,
      "epoch": 0.424,
      "grad_norm": 0.10337146371603012,
      "kl": 0.014007568359375,
      "learning_rate": 9.66045715125541e-07,
      "loss": 0.0441,
      "reward": 0.24333537928760052,
      "reward_std": 0.7411631494760513,
      "rewards/cosine_scaled_reward": -0.08071326930075884,
      "rewards/format_reward": 0.4047619141638279,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2804.7500610351562,
      "epoch": 0.428,
      "grad_norm": 0.1334601491689682,
      "kl": 0.010711669921875,
      "learning_rate": 9.648384182148252e-07,
      "loss": 0.0741,
      "reward": 0.21579574886709452,
      "reward_std": 0.559941440820694,
      "rewards/cosine_scaled_reward": -0.08257831074297428,
      "rewards/format_reward": 0.380952388048172,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2947.9583740234375,
      "epoch": 0.432,
      "grad_norm": 0.15339502692222595,
      "kl": 0.0123443603515625,
      "learning_rate": 9.636109026648554e-07,
      "loss": 0.0797,
      "reward": 0.2714387159794569,
      "reward_std": 0.5535019040107727,
      "rewards/cosine_scaled_reward": -0.060709220357239246,
      "rewards/format_reward": 0.3928571492433548,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3165.3631591796875,
      "epoch": 0.436,
      "grad_norm": 0.14555484056472778,
      "kl": 0.0123748779296875,
      "learning_rate": 9.623632283030077e-07,
      "loss": 0.0689,
      "reward": 0.3741426505148411,
      "reward_std": 0.7712415158748627,
      "rewards/cosine_scaled_reward": 0.011476085986942053,
      "rewards/format_reward": 0.351190485060215,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2682.607177734375,
      "epoch": 0.44,
      "grad_norm": 3.0458719730377197,
      "kl": 0.1771697998046875,
      "learning_rate": 9.610954559391704e-07,
      "loss": 0.0576,
      "reward": 0.43371669203042984,
      "reward_std": 0.6959643810987473,
      "rewards/cosine_scaled_reward": -0.02718928176909685,
      "rewards/format_reward": 0.4880952462553978,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2601.636993408203,
      "epoch": 0.444,
      "grad_norm": 0.16071970760822296,
      "kl": 0.0132904052734375,
      "learning_rate": 9.598076473627796e-07,
      "loss": 0.0475,
      "reward": 0.3634342849254608,
      "reward_std": 0.5500286221504211,
      "rewards/cosine_scaled_reward": -0.06233047042042017,
      "rewards/format_reward": 0.4880952462553978,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2848.0,
      "epoch": 0.448,
      "grad_norm": 0.11652833968400955,
      "kl": 0.01318359375,
      "learning_rate": 9.58499865339809e-07,
      "loss": 0.0216,
      "reward": 0.4104595482349396,
      "reward_std": 0.7775004655122757,
      "rewards/cosine_scaled_reward": -0.023936893790960312,
      "rewards/format_reward": 0.4583333358168602,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3060.696533203125,
      "epoch": 0.452,
      "grad_norm": 0.11911512911319733,
      "kl": 0.019012451171875,
      "learning_rate": 9.571721736097088e-07,
      "loss": 0.0351,
      "reward": 0.14249714091420174,
      "reward_std": 0.5928184911608696,
      "rewards/cosine_scaled_reward": -0.08946572133572772,
      "rewards/format_reward": 0.32142857648432255,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3025.3392944335938,
      "epoch": 0.456,
      "grad_norm": 0.11119002103805542,
      "kl": 0.013275146484375,
      "learning_rate": 9.55824636882301e-07,
      "loss": 0.0157,
      "reward": 0.2583576124161482,
      "reward_std": 0.5952321216464043,
      "rewards/cosine_scaled_reward": -0.05236881226301193,
      "rewards/format_reward": 0.3630952462553978,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2642.4524536132812,
      "epoch": 0.46,
      "grad_norm": 0.13256801664829254,
      "kl": 0.0142669677734375,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.0464,
      "reward": 0.26410975866019726,
      "reward_std": 0.7131557315587997,
      "rewards/cosine_scaled_reward": -0.10604035668075085,
      "rewards/format_reward": 0.476190485060215,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2842.172607421875,
      "epoch": 0.464,
      "grad_norm": 0.14555224776268005,
      "kl": 0.0131683349609375,
      "learning_rate": 9.530702921077358e-07,
      "loss": 0.06,
      "reward": 0.7474905252456665,
      "reward_std": 0.9560296833515167,
      "rewards/cosine_scaled_reward": 0.10291192133445293,
      "rewards/format_reward": 0.5416666716337204,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2644.851318359375,
      "epoch": 0.468,
      "grad_norm": 0.1801517903804779,
      "kl": 0.0159149169921875,
      "learning_rate": 9.516636183034564e-07,
      "loss": 0.0868,
      "reward": 0.3559920974075794,
      "reward_std": 0.6948887631297112,
      "rewards/cosine_scaled_reward": -0.05712300445884466,
      "rewards/format_reward": 0.4702381044626236,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2898.8692016601562,
      "epoch": 0.472,
      "grad_norm": 0.11308304965496063,
      "kl": 0.01580810546875,
      "learning_rate": 9.502373679810839e-07,
      "loss": 0.0347,
      "reward": -0.02927885064855218,
      "reward_std": 0.5874328389763832,
      "rewards/cosine_scaled_reward": -0.18725845962762833,
      "rewards/format_reward": 0.3452381044626236,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2942.3869018554688,
      "epoch": 0.476,
      "grad_norm": 0.11883487552404404,
      "kl": 0.0146484375,
      "learning_rate": 9.487916106540465e-07,
      "loss": 0.0533,
      "reward": 0.2897670716047287,
      "reward_std": 0.624944195151329,
      "rewards/cosine_scaled_reward": -0.05452123726718128,
      "rewards/format_reward": 0.3988095298409462,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2888.5537109375,
      "epoch": 0.48,
      "grad_norm": 0.18806912004947662,
      "kl": 0.0160980224609375,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.0988,
      "reward": 0.42369477450847626,
      "reward_std": 0.8195747882127762,
      "rewards/cosine_scaled_reward": -0.002438324736431241,
      "rewards/format_reward": 0.4285714365541935,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2582.732177734375,
      "epoch": 0.484,
      "grad_norm": 0.24437126517295837,
      "kl": 0.0138702392578125,
      "learning_rate": 9.458418577899774e-07,
      "loss": 0.1221,
      "reward": 0.5238880245015025,
      "reward_std": 0.7648549973964691,
      "rewards/cosine_scaled_reward": -0.005913139786571264,
      "rewards/format_reward": 0.535714291036129,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2693.202392578125,
      "epoch": 0.488,
      "grad_norm": 0.1520787924528122,
      "kl": 0.016265869140625,
      "learning_rate": 9.443380060197385e-07,
      "loss": 0.075,
      "reward": 0.5096995830535889,
      "reward_std": 0.8008040487766266,
      "rewards/cosine_scaled_reward": 0.01675456203520298,
      "rewards/format_reward": 0.4761904776096344,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2794.4822387695312,
      "epoch": 0.492,
      "grad_norm": 0.276404470205307,
      "kl": 0.016845703125,
      "learning_rate": 9.428149347714143e-07,
      "loss": 0.1229,
      "reward": 0.1483494946733117,
      "reward_std": 0.7319334298372269,
      "rewards/cosine_scaled_reward": -0.11034906562417746,
      "rewards/format_reward": 0.3690476268529892,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2586.9285583496094,
      "epoch": 0.496,
      "grad_norm": 0.19590145349502563,
      "kl": 0.0192413330078125,
      "learning_rate": 9.412727182773486e-07,
      "loss": 0.0668,
      "reward": 0.42041725292801857,
      "reward_std": 0.7440174072980881,
      "rewards/cosine_scaled_reward": -0.05169615335762501,
      "rewards/format_reward": 0.5238095372915268,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2647.1309814453125,
      "epoch": 0.5,
      "grad_norm": 0.1762569099664688,
      "kl": 0.018951416015625,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.058,
      "reward": 0.26979109086096287,
      "reward_std": 0.7384046316146851,
      "rewards/cosine_scaled_reward": -0.10022350586950779,
      "rewards/format_reward": 0.4702380932867527,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2329.4642639160156,
      "epoch": 0.504,
      "grad_norm": 0.2497519552707672,
      "kl": 0.016754150390625,
      "learning_rate": 9.381311511432658e-07,
      "loss": 0.1113,
      "reward": 0.5470606535673141,
      "reward_std": 0.7599766105413437,
      "rewards/cosine_scaled_reward": -0.04194586584344506,
      "rewards/format_reward": 0.630952388048172,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2630.851318359375,
      "epoch": 0.508,
      "grad_norm": 0.24775269627571106,
      "kl": 0.018798828125,
      "learning_rate": 9.36531953618799e-07,
      "loss": 0.0886,
      "reward": 0.5138388648629189,
      "reward_std": 0.8158636838197708,
      "rewards/cosine_scaled_reward": 0.00989562287577428,
      "rewards/format_reward": 0.4940476194024086,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2193.1250610351562,
      "epoch": 0.512,
      "grad_norm": 0.3206213712692261,
      "kl": 0.0168914794921875,
      "learning_rate": 9.34913917072228e-07,
      "loss": 0.1596,
      "reward": 0.7910499274730682,
      "reward_std": 0.7149495184421539,
      "rewards/cosine_scaled_reward": 0.0502868490293622,
      "rewards/format_reward": 0.6904762089252472,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2532.4642944335938,
      "epoch": 0.516,
      "grad_norm": 0.28250807523727417,
      "kl": 0.021270751953125,
      "learning_rate": 9.332771203643714e-07,
      "loss": 0.0845,
      "reward": 0.36558002047240734,
      "reward_std": 0.637114867568016,
      "rewards/cosine_scaled_reward": -0.05232903314754367,
      "rewards/format_reward": 0.470238097012043,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2768.4702758789062,
      "epoch": 0.52,
      "grad_norm": 0.26120948791503906,
      "kl": 0.026092529296875,
      "learning_rate": 9.316216432703916e-07,
      "loss": 0.1052,
      "reward": 0.41776999086141586,
      "reward_std": 0.9506262838840485,
      "rewards/cosine_scaled_reward": -0.029210255946964025,
      "rewards/format_reward": 0.4761904776096344,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2400.5000915527344,
      "epoch": 0.524,
      "grad_norm": 0.33211514353752136,
      "kl": 0.0215606689453125,
      "learning_rate": 9.299475664759068e-07,
      "loss": 0.1452,
      "reward": 0.42596414871513844,
      "reward_std": 0.6515605002641678,
      "rewards/cosine_scaled_reward": -0.04892268590629101,
      "rewards/format_reward": 0.5238095298409462,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2368.9524536132812,
      "epoch": 0.528,
      "grad_norm": 0.41833382844924927,
      "kl": 0.020782470703125,
      "learning_rate": 9.282549715730579e-07,
      "loss": 0.1107,
      "reward": 0.5763177648186684,
      "reward_std": 0.7463207244873047,
      "rewards/cosine_scaled_reward": 0.005420786794275045,
      "rewards/format_reward": 0.5654762089252472,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2800.4940795898438,
      "epoch": 0.532,
      "grad_norm": 0.21257169544696808,
      "kl": 0.031768798828125,
      "learning_rate": 9.265439410565328e-07,
      "loss": 0.0654,
      "reward": 0.27612858824431896,
      "reward_std": 0.6431511640548706,
      "rewards/cosine_scaled_reward": -0.07026905845850706,
      "rewards/format_reward": 0.4166666679084301,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2558.386962890625,
      "epoch": 0.536,
      "grad_norm": 0.42514950037002563,
      "kl": 0.02978515625,
      "learning_rate": 9.248145583195447e-07,
      "loss": 0.1228,
      "reward": 0.4270520806312561,
      "reward_std": 0.7729989290237427,
      "rewards/cosine_scaled_reward": -0.02159299748018384,
      "rewards/format_reward": 0.4702381044626236,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2229.363067626953,
      "epoch": 0.54,
      "grad_norm": 0.33076903223991394,
      "kl": 0.03668212890625,
      "learning_rate": 9.230669076497687e-07,
      "loss": 0.1019,
      "reward": 0.28207028564065695,
      "reward_std": 0.6516975909471512,
      "rewards/cosine_scaled_reward": -0.10301248356699944,
      "rewards/format_reward": 0.4880952388048172,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2353.029815673828,
      "epoch": 0.544,
      "grad_norm": 0.3519177734851837,
      "kl": 0.035308837890625,
      "learning_rate": 9.213010742252327e-07,
      "loss": 0.0997,
      "reward": 0.37077474407851696,
      "reward_std": 0.668467104434967,
      "rewards/cosine_scaled_reward": -0.0675888154655695,
      "rewards/format_reward": 0.505952388048172,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2052.1964721679688,
      "epoch": 0.548,
      "grad_norm": 0.23379026353359222,
      "kl": 0.035400390625,
      "learning_rate": 9.195171441101668e-07,
      "loss": 0.058,
      "reward": 0.6550269052386284,
      "reward_std": 0.6502309143543243,
      "rewards/cosine_scaled_reward": 0.009061065968126059,
      "rewards/format_reward": 0.6369047611951828,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2325.732177734375,
      "epoch": 0.552,
      "grad_norm": 0.19041714072227478,
      "kl": 0.0423583984375,
      "learning_rate": 9.177152042508077e-07,
      "loss": 0.0232,
      "reward": 0.561458358541131,
      "reward_std": 0.9615298509597778,
      "rewards/cosine_scaled_reward": 0.012872030027210712,
      "rewards/format_reward": 0.5357142947614193,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2440.327423095703,
      "epoch": 0.556,
      "grad_norm": 0.2846536934375763,
      "kl": 0.0457763671875,
      "learning_rate": 9.158953424711624e-07,
      "loss": 0.0439,
      "reward": 0.44825945422053337,
      "reward_std": 0.7441610246896744,
      "rewards/cosine_scaled_reward": -0.022894082590937614,
      "rewards/format_reward": 0.4940476417541504,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2455.071533203125,
      "epoch": 0.56,
      "grad_norm": 0.5667356252670288,
      "kl": 0.05535888671875,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.1203,
      "reward": 0.42634591602836736,
      "reward_std": 0.6539553329348564,
      "rewards/cosine_scaled_reward": 0.007815815508365631,
      "rewards/format_reward": 0.4107142984867096,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2289.0416870117188,
      "epoch": 0.564,
      "grad_norm": 0.2632148861885071,
      "kl": 0.06378173828125,
      "learning_rate": 9.122022088101613e-07,
      "loss": 0.0588,
      "reward": 0.25764250196516514,
      "reward_std": 0.5646726861596107,
      "rewards/cosine_scaled_reward": -0.07355970796197653,
      "rewards/format_reward": 0.4047619141638279,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2396.1011962890625,
      "epoch": 0.568,
      "grad_norm": 0.48258456587791443,
      "kl": 0.070556640625,
      "learning_rate": 9.103291169269299e-07,
      "loss": 0.1188,
      "reward": 0.3830295614898205,
      "reward_std": 0.7874267548322678,
      "rewards/cosine_scaled_reward": 0.0069909729063510895,
      "rewards/format_reward": 0.3690476268529892,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2465.184539794922,
      "epoch": 0.572,
      "grad_norm": 0.3696215748786926,
      "kl": 0.083984375,
      "learning_rate": 9.084384631108882e-07,
      "loss": 0.0378,
      "reward": 0.17246808065101504,
      "reward_std": 0.7914570420980453,
      "rewards/cosine_scaled_reward": -0.11614691279828548,
      "rewards/format_reward": 0.4047619104385376,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2460.184600830078,
      "epoch": 0.576,
      "grad_norm": 0.30795326828956604,
      "kl": 0.08349609375,
      "learning_rate": 9.065303395098358e-07,
      "loss": 0.0254,
      "reward": 0.23997123539447784,
      "reward_std": 0.7133302837610245,
      "rewards/cosine_scaled_reward": -0.09430009685456753,
      "rewards/format_reward": 0.4285714365541935,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2376.089324951172,
      "epoch": 0.58,
      "grad_norm": 0.5491130352020264,
      "kl": 0.0899658203125,
      "learning_rate": 9.046048391230247e-07,
      "loss": 0.0823,
      "reward": 0.4339366629719734,
      "reward_std": 0.7774848788976669,
      "rewards/cosine_scaled_reward": 0.005658812588080764,
      "rewards/format_reward": 0.4226190559566021,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2340.136962890625,
      "epoch": 0.584,
      "grad_norm": 0.3470991551876068,
      "kl": 0.1185302734375,
      "learning_rate": 9.026620557966279e-07,
      "loss": 0.0431,
      "reward": 0.31324461475014687,
      "reward_std": 0.7800580561161041,
      "rewards/cosine_scaled_reward": -0.10528245754539967,
      "rewards/format_reward": 0.5238095298409462,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2512.7262573242188,
      "epoch": 0.588,
      "grad_norm": 0.31661325693130493,
      "kl": 0.1114501953125,
      "learning_rate": 9.007020842191634e-07,
      "loss": 0.0189,
      "reward": 0.2997382581233978,
      "reward_std": 0.8120257556438446,
      "rewards/cosine_scaled_reward": -0.06144038587808609,
      "rewards/format_reward": 0.4226190485060215,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2311.8035888671875,
      "epoch": 0.592,
      "grad_norm": 0.9110273718833923,
      "kl": 0.1287841796875,
      "learning_rate": 8.987250199168808e-07,
      "loss": 0.0875,
      "reward": 0.29848775546997786,
      "reward_std": 0.7376701682806015,
      "rewards/cosine_scaled_reward": -0.06206565350294113,
      "rewards/format_reward": 0.4226190596818924,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2368.2500915527344,
      "epoch": 0.596,
      "grad_norm": 0.5145498514175415,
      "kl": 0.142333984375,
      "learning_rate": 8.967309592491052e-07,
      "loss": -0.0091,
      "reward": 0.1381237395107746,
      "reward_std": 0.7537627294659615,
      "rewards/cosine_scaled_reward": -0.12736669927835464,
      "rewards/format_reward": 0.3928571492433548,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2541.4226684570312,
      "epoch": 0.6,
      "grad_norm": 0.4558282792568207,
      "kl": 0.1424560546875,
      "learning_rate": 8.9471999940354e-07,
      "loss": 0.0545,
      "reward": 0.517300067236647,
      "reward_std": 0.8328704386949539,
      "rewards/cosine_scaled_reward": 0.014602408395148814,
      "rewards/format_reward": 0.4880952388048172,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2635.4226989746094,
      "epoch": 0.604,
      "grad_norm": 0.3748377859592438,
      "kl": 0.18310546875,
      "learning_rate": 8.926922383915315e-07,
      "loss": 0.0372,
      "reward": 0.09937155619263649,
      "reward_std": 0.7007840871810913,
      "rewards/cosine_scaled_reward": -0.1259094497654587,
      "rewards/format_reward": 0.3511904776096344,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2366.8036499023438,
      "epoch": 0.608,
      "grad_norm": 0.7343178391456604,
      "kl": 0.196533203125,
      "learning_rate": 8.906477750432903e-07,
      "loss": 0.1069,
      "reward": 0.6113147716969252,
      "reward_std": 0.8982192724943161,
      "rewards/cosine_scaled_reward": 0.028871658723801374,
      "rewards/format_reward": 0.5535714402794838,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2539.6726684570312,
      "epoch": 0.612,
      "grad_norm": 0.3661244213581085,
      "kl": 0.23095703125,
      "learning_rate": 8.88586709003076e-07,
      "loss": 0.0493,
      "reward": 0.3096798346377909,
      "reward_std": 0.6143878847360611,
      "rewards/cosine_scaled_reward": -0.07432675641030073,
      "rewards/format_reward": 0.4583333432674408,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2693.077392578125,
      "epoch": 0.616,
      "grad_norm": 0.39779341220855713,
      "kl": 0.26123046875,
      "learning_rate": 8.865091407243394e-07,
      "loss": 0.0484,
      "reward": 0.20376494899392128,
      "reward_std": 0.7454717755317688,
      "rewards/cosine_scaled_reward": -0.0856175352819264,
      "rewards/format_reward": 0.3750000074505806,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2522.8333740234375,
      "epoch": 0.62,
      "grad_norm": 0.7077339291572571,
      "kl": 0.275634765625,
      "learning_rate": 8.844151714648274e-07,
      "loss": 0.1048,
      "reward": 0.28493453562259674,
      "reward_std": 0.7751601040363312,
      "rewards/cosine_scaled_reward": -0.050985115580260754,
      "rewards/format_reward": 0.3869047649204731,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2791.2083740234375,
      "epoch": 0.624,
      "grad_norm": 0.6277625560760498,
      "kl": 0.3359375,
      "learning_rate": 8.823049032816478e-07,
      "loss": 0.063,
      "reward": 0.15741928666830063,
      "reward_std": 0.7891719415783882,
      "rewards/cosine_scaled_reward": -0.1058141621761024,
      "rewards/format_reward": 0.3690476268529892,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2756.0596313476562,
      "epoch": 0.628,
      "grad_norm": 0.9464259147644043,
      "kl": 0.35107421875,
      "learning_rate": 8.801784390262943e-07,
      "loss": 0.1337,
      "reward": 0.20047340355813503,
      "reward_std": 0.7717511355876923,
      "rewards/cosine_scaled_reward": -0.10214426182210445,
      "rewards/format_reward": 0.4047619178891182,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2546.386993408203,
      "epoch": 0.632,
      "grad_norm": 0.9672547578811646,
      "kl": 0.3583984375,
      "learning_rate": 8.780358823396352e-07,
      "loss": 0.1309,
      "reward": 0.3896455895155668,
      "reward_std": 0.8362017869949341,
      "rewards/cosine_scaled_reward": -0.025415319949388504,
      "rewards/format_reward": 0.4404761902987957,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2490.3869018554688,
      "epoch": 0.636,
      "grad_norm": 0.5016924738883972,
      "kl": 0.37744140625,
      "learning_rate": 8.758773376468604e-07,
      "loss": 0.0548,
      "reward": 0.3971053212881088,
      "reward_std": 0.6817308068275452,
      "rewards/cosine_scaled_reward": -0.07823306252248585,
      "rewards/format_reward": 0.5535714328289032,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2715.90478515625,
      "epoch": 0.64,
      "grad_norm": 0.776878833770752,
      "kl": 0.4169921875,
      "learning_rate": 8.737029101523929e-07,
      "loss": 0.1414,
      "reward": 0.37737663462758064,
      "reward_std": 0.8348212540149689,
      "rewards/cosine_scaled_reward": -0.03452597954310477,
      "rewards/format_reward": 0.4464285746216774,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2636.0357666015625,
      "epoch": 0.644,
      "grad_norm": 1.2749825716018677,
      "kl": 0.48486328125,
      "learning_rate": 8.715127058347614e-07,
      "loss": 0.1445,
      "reward": 0.24750607460737228,
      "reward_std": 0.7917188853025436,
      "rewards/cosine_scaled_reward": -0.10541364271193743,
      "rewards/format_reward": 0.4583333469927311,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2622.9405517578125,
      "epoch": 0.648,
      "grad_norm": 1.3737562894821167,
      "kl": 0.5888671875,
      "learning_rate": 8.693068314414344e-07,
      "loss": 0.1549,
      "reward": 0.10282446062774397,
      "reward_std": 0.6833581179380417,
      "rewards/cosine_scaled_reward": -0.18073063343763351,
      "rewards/format_reward": 0.4642857275903225,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2187.9286193847656,
      "epoch": 0.652,
      "grad_norm": 1.2476062774658203,
      "kl": 0.64453125,
      "learning_rate": 8.670853944836176e-07,
      "loss": 0.1442,
      "reward": 0.5821249708533287,
      "reward_std": 0.8525291532278061,
      "rewards/cosine_scaled_reward": -0.0303660926874727,
      "rewards/format_reward": 0.6428571492433548,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2645.2202758789062,
      "epoch": 0.656,
      "grad_norm": 0.8759295344352722,
      "kl": 0.83203125,
      "learning_rate": 8.648485032310144e-07,
      "loss": 0.1369,
      "reward": 0.354750145226717,
      "reward_std": 0.6708278656005859,
      "rewards/cosine_scaled_reward": -0.09941063448786736,
      "rewards/format_reward": 0.5535714328289032,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2744.5952758789062,
      "epoch": 0.66,
      "grad_norm": 1.443908452987671,
      "kl": 0.9365234375,
      "learning_rate": 8.625962667065487e-07,
      "loss": 0.1514,
      "reward": 0.07671361323446035,
      "reward_std": 0.7401341199874878,
      "rewards/cosine_scaled_reward": -0.16997654270380735,
      "rewards/format_reward": 0.4166666679084301,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2762.2381591796875,
      "epoch": 0.664,
      "grad_norm": 2.171701192855835,
      "kl": 1.064453125,
      "learning_rate": 8.603287946810513e-07,
      "loss": 0.0493,
      "reward": 0.3810354620218277,
      "reward_std": 0.6359066590666771,
      "rewards/cosine_scaled_reward": -0.05650608614087105,
      "rewards/format_reward": 0.4940476231276989,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2410.6726684570312,
      "epoch": 0.668,
      "grad_norm": 1.1915135383605957,
      "kl": 0.9716796875,
      "learning_rate": 8.580461976679099e-07,
      "loss": 0.1178,
      "reward": 0.5956609398126602,
      "reward_std": 0.7429262697696686,
      "rewards/cosine_scaled_reward": -0.011693337932229042,
      "rewards/format_reward": 0.6190476268529892,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2624.7083740234375,
      "epoch": 0.672,
      "grad_norm": 1.2750567197799683,
      "kl": 1.111328125,
      "learning_rate": 8.557485869176825e-07,
      "loss": 0.1676,
      "reward": 0.35937594436109066,
      "reward_std": 0.7485721707344055,
      "rewards/cosine_scaled_reward": -0.0613834522664547,
      "rewards/format_reward": 0.482142873108387,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2566.7857666015625,
      "epoch": 0.676,
      "grad_norm": 0.8985515832901001,
      "kl": 1.0439453125,
      "learning_rate": 8.534360744126753e-07,
      "loss": 0.1232,
      "reward": 0.23157138470560312,
      "reward_std": 0.6288014650344849,
      "rewards/cosine_scaled_reward": -0.14314288273453712,
      "rewards/format_reward": 0.5178571492433548,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2820.5059814453125,
      "epoch": 0.68,
      "grad_norm": 1.1454522609710693,
      "kl": 0.9677734375,
      "learning_rate": 8.511087728614862e-07,
      "loss": 0.1412,
      "reward": 0.08721911488100886,
      "reward_std": 0.6948041319847107,
      "rewards/cosine_scaled_reward": -0.16769996285438538,
      "rewards/format_reward": 0.4226190522313118,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2376.166748046875,
      "epoch": 0.684,
      "grad_norm": 0.9355194568634033,
      "kl": 0.9521484375,
      "learning_rate": 8.487667956935087e-07,
      "loss": 0.128,
      "reward": 0.41750996466726065,
      "reward_std": 0.7085302621126175,
      "rewards/cosine_scaled_reward": -0.05910217575728893,
      "rewards/format_reward": 0.535714291036129,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2571.5000610351562,
      "epoch": 0.688,
      "grad_norm": 0.9496890902519226,
      "kl": 1.0341796875,
      "learning_rate": 8.464102570534061e-07,
      "loss": 0.147,
      "reward": 0.21527537889778614,
      "reward_std": 0.6487467139959335,
      "rewards/cosine_scaled_reward": -0.2048623152077198,
      "rewards/format_reward": 0.6250000149011612,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2577.839324951172,
      "epoch": 0.692,
      "grad_norm": 1.125106692314148,
      "kl": 1.005859375,
      "learning_rate": 8.440392717955475e-07,
      "loss": 0.1126,
      "reward": 0.29065654147416353,
      "reward_std": 0.5777322202920914,
      "rewards/cosine_scaled_reward": -0.11955267190933228,
      "rewards/format_reward": 0.5297619178891182,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2297.0416870117188,
      "epoch": 0.696,
      "grad_norm": 1.5477794408798218,
      "kl": 0.9482421875,
      "learning_rate": 8.416539554784089e-07,
      "loss": 0.0866,
      "reward": 0.35003964975476265,
      "reward_std": 0.7120198756456375,
      "rewards/cosine_scaled_reward": -0.13152779638767242,
      "rewards/format_reward": 0.6130952388048172,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2239.952423095703,
      "epoch": 0.7,
      "grad_norm": 1.1404165029525757,
      "kl": 0.7734375,
      "learning_rate": 8.392544243589427e-07,
      "loss": 0.1326,
      "reward": 0.7693988904356956,
      "reward_std": 0.8029063045978546,
      "rewards/cosine_scaled_reward": 0.0543422931805253,
      "rewards/format_reward": 0.6607142984867096,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2214.148895263672,
      "epoch": 0.704,
      "grad_norm": 0.976016104221344,
      "kl": 0.8193359375,
      "learning_rate": 8.368407953869103e-07,
      "loss": 0.1005,
      "reward": 0.5222894381731749,
      "reward_std": 0.6858630776405334,
      "rewards/cosine_scaled_reward": -0.07814099243842065,
      "rewards/format_reward": 0.6785714477300644,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2167.4345092773438,
      "epoch": 0.708,
      "grad_norm": 1.6724809408187866,
      "kl": 0.740234375,
      "learning_rate": 8.344131861991828e-07,
      "loss": 0.1424,
      "reward": 0.3468378521502018,
      "reward_std": 0.6407709717750549,
      "rewards/cosine_scaled_reward": -0.16289059445261955,
      "rewards/format_reward": 0.6726190596818924,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2593.5654907226562,
      "epoch": 0.712,
      "grad_norm": 1.3712421655654907,
      "kl": 0.9814453125,
      "learning_rate": 8.319717151140072e-07,
      "loss": 0.1121,
      "reward": 0.27433447539806366,
      "reward_std": 0.6857093423604965,
      "rewards/cosine_scaled_reward": -0.16342800296843052,
      "rewards/format_reward": 0.6011904925107956,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2240.9583129882812,
      "epoch": 0.716,
      "grad_norm": 2.2109479904174805,
      "kl": 0.7880859375,
      "learning_rate": 8.295165011252396e-07,
      "loss": 0.0613,
      "reward": 0.3249462991952896,
      "reward_std": 0.7396285533905029,
      "rewards/cosine_scaled_reward": -0.12919352855533361,
      "rewards/format_reward": 0.5833333358168602,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2391.261962890625,
      "epoch": 0.72,
      "grad_norm": 0.9252892136573792,
      "kl": 0.8369140625,
      "learning_rate": 8.270476638965461e-07,
      "loss": 0.0766,
      "reward": 0.37066294252872467,
      "reward_std": 0.5772489011287689,
      "rewards/cosine_scaled_reward": -0.15395426377654076,
      "rewards/format_reward": 0.6785714477300644,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2188.21435546875,
      "epoch": 0.724,
      "grad_norm": 1.4679890871047974,
      "kl": 0.7177734375,
      "learning_rate": 8.245653237555705e-07,
      "loss": 0.1271,
      "reward": 0.47163213789463043,
      "reward_std": 0.7110278159379959,
      "rewards/cosine_scaled_reward": -0.10942202992737293,
      "rewards/format_reward": 0.6904762089252472,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2330.3572387695312,
      "epoch": 0.728,
      "grad_norm": 0.8398174047470093,
      "kl": 0.71875,
      "learning_rate": 8.220696016880687e-07,
      "loss": 0.0837,
      "reward": 0.5167603380978107,
      "reward_std": 0.704664558172226,
      "rewards/cosine_scaled_reward": -0.08090554922819138,
      "rewards/format_reward": 0.6785714402794838,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2319.154815673828,
      "epoch": 0.732,
      "grad_norm": 1.0028657913208008,
      "kl": 0.7421875,
      "learning_rate": 8.195606193320136e-07,
      "loss": 0.1069,
      "reward": 0.6520561873912811,
      "reward_std": 0.8034340292215347,
      "rewards/cosine_scaled_reward": -0.04301954247057438,
      "rewards/format_reward": 0.7380952537059784,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2419.6131591796875,
      "epoch": 0.736,
      "grad_norm": 0.9799902439117432,
      "kl": 0.794921875,
      "learning_rate": 8.170384989716657e-07,
      "loss": 0.1,
      "reward": 0.5134465768933296,
      "reward_std": 0.7416307479143143,
      "rewards/cosine_scaled_reward": -0.10637196339666843,
      "rewards/format_reward": 0.7261904776096344,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2279.3630981445312,
      "epoch": 0.74,
      "grad_norm": 1.1403197050094604,
      "kl": 0.75390625,
      "learning_rate": 8.145033635316128e-07,
      "loss": 0.08,
      "reward": 0.5693989507853985,
      "reward_std": 0.6981105357408524,
      "rewards/cosine_scaled_reward": -0.06946719996631145,
      "rewards/format_reward": 0.7083333507180214,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2087.2679443359375,
      "epoch": 0.744,
      "grad_norm": 0.8785580992698669,
      "kl": 0.6123046875,
      "learning_rate": 8.119553365707802e-07,
      "loss": 0.0849,
      "reward": 0.4244233965873718,
      "reward_std": 0.718925341963768,
      "rewards/cosine_scaled_reward": -0.1449311599135399,
      "rewards/format_reward": 0.7142857313156128,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2415.8036499023438,
      "epoch": 0.748,
      "grad_norm": 1.325434684753418,
      "kl": 0.6298828125,
      "learning_rate": 8.093945422764069e-07,
      "loss": 0.0477,
      "reward": 0.594460990279913,
      "reward_std": 0.7041322290897369,
      "rewards/cosine_scaled_reward": -0.021221883594989777,
      "rewards/format_reward": 0.636904776096344,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2371.0714721679688,
      "epoch": 0.752,
      "grad_norm": 1.3853912353515625,
      "kl": 0.638671875,
      "learning_rate": 8.068211054579943e-07,
      "loss": 0.1131,
      "reward": 0.5956445932388306,
      "reward_std": 0.7780069708824158,
      "rewards/cosine_scaled_reward": -0.062296761316247284,
      "rewards/format_reward": 0.7202381044626236,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2243.3333740234375,
      "epoch": 0.756,
      "grad_norm": 0.7066504955291748,
      "kl": 0.564453125,
      "learning_rate": 8.04235151541222e-07,
      "loss": 0.043,
      "reward": 0.7391829118132591,
      "reward_std": 0.6626263409852982,
      "rewards/cosine_scaled_reward": -0.014337139204144478,
      "rewards/format_reward": 0.767857164144516,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2111.875030517578,
      "epoch": 0.76,
      "grad_norm": 1.1808303594589233,
      "kl": 0.5361328125,
      "learning_rate": 8.01636806561836e-07,
      "loss": 0.0212,
      "reward": 0.6303885579109192,
      "reward_std": 0.7266089022159576,
      "rewards/cosine_scaled_reward": -0.04790095146745443,
      "rewards/format_reward": 0.7261904925107956,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2512.1607666015625,
      "epoch": 0.764,
      "grad_norm": 1.169936180114746,
      "kl": 0.54736328125,
      "learning_rate": 7.990261971595048e-07,
      "loss": 0.0239,
      "reward": 0.4208872392773628,
      "reward_std": 0.6789906620979309,
      "rewards/cosine_scaled_reward": -0.12288972595706582,
      "rewards/format_reward": 0.6666666716337204,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2421.7381591796875,
      "epoch": 0.768,
      "grad_norm": 1.9125944375991821,
      "kl": 0.44970703125,
      "learning_rate": 7.964034505716476e-07,
      "loss": 0.162,
      "reward": 0.6703099310398102,
      "reward_std": 0.7079124301671982,
      "rewards/cosine_scaled_reward": 0.022654948756098747,
      "rewards/format_reward": 0.625,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2342.3333435058594,
      "epoch": 0.772,
      "grad_norm": 1.1848394870758057,
      "kl": 0.4287109375,
      "learning_rate": 7.93768694627233e-07,
      "loss": 0.1217,
      "reward": 0.3946942985057831,
      "reward_std": 0.7293716818094254,
      "rewards/cosine_scaled_reward": -0.14789094775915146,
      "rewards/format_reward": 0.6904762089252472,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2488.1786193847656,
      "epoch": 0.776,
      "grad_norm": 0.8427687883377075,
      "kl": 0.40673828125,
      "learning_rate": 7.911220577405484e-07,
      "loss": 0.0681,
      "reward": 0.33857931289821863,
      "reward_std": 0.7693478316068649,
      "rewards/cosine_scaled_reward": -0.11642462853342295,
      "rewards/format_reward": 0.5714285671710968,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2235.4762573242188,
      "epoch": 0.78,
      "grad_norm": 1.9778449535369873,
      "kl": 0.4599609375,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.1203,
      "reward": 0.7276730462908745,
      "reward_std": 0.8504652380943298,
      "rewards/cosine_scaled_reward": 0.02455079648643732,
      "rewards/format_reward": 0.6785714328289032,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2252.7262268066406,
      "epoch": 0.784,
      "grad_norm": 1.251224398612976,
      "kl": 0.49169921875,
      "learning_rate": 7.857936576865356e-07,
      "loss": 0.0753,
      "reward": 0.6360676661133766,
      "reward_std": 0.8185366541147232,
      "rewards/cosine_scaled_reward": -0.01827568793669343,
      "rewards/format_reward": 0.6726190596818924,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2399.7500610351562,
      "epoch": 0.788,
      "grad_norm": 0.9470409154891968,
      "kl": 0.517578125,
      "learning_rate": 7.831121542179086e-07,
      "loss": 0.1036,
      "reward": 0.550631508231163,
      "reward_std": 0.7208298593759537,
      "rewards/cosine_scaled_reward": -0.037184251472353935,
      "rewards/format_reward": 0.625,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2269.5000610351562,
      "epoch": 0.792,
      "grad_norm": 2.047698974609375,
      "kl": 0.595703125,
      "learning_rate": 7.804192891917571e-07,
      "loss": 0.1831,
      "reward": 0.29151881486177444,
      "reward_std": 0.666583925485611,
      "rewards/cosine_scaled_reward": -0.15483582392334938,
      "rewards/format_reward": 0.601190485060215,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2316.7857666015625,
      "epoch": 0.796,
      "grad_norm": 1.6713296175003052,
      "kl": 0.60205078125,
      "learning_rate": 7.777151938545235e-07,
      "loss": 0.1356,
      "reward": 0.5018086154013872,
      "reward_std": 0.8012387007474899,
      "rewards/cosine_scaled_reward": -0.0615957040572539,
      "rewards/format_reward": 0.6250000223517418,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2471.1964721679688,
      "epoch": 0.8,
      "grad_norm": 0.9633775949478149,
      "kl": 0.740234375,
      "learning_rate": 7.75e-07,
      "loss": 0.1277,
      "reward": 0.3792301341891289,
      "reward_std": 0.76199010014534,
      "rewards/cosine_scaled_reward": -0.11693255044519901,
      "rewards/format_reward": 0.6130952537059784,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2280.6607971191406,
      "epoch": 0.804,
      "grad_norm": 1.1369765996932983,
      "kl": 0.7587890625,
      "learning_rate": 7.72273839962904e-07,
      "loss": 0.1174,
      "reward": 0.4361310079693794,
      "reward_std": 0.7977508455514908,
      "rewards/cosine_scaled_reward": -0.0736011671833694,
      "rewards/format_reward": 0.5833333432674408,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2239.3809814453125,
      "epoch": 0.808,
      "grad_norm": 1.1852681636810303,
      "kl": 0.80078125,
      "learning_rate": 7.695368466124296e-07,
      "loss": 0.1861,
      "reward": 0.4273875653743744,
      "reward_std": 0.7939650565385818,
      "rewards/cosine_scaled_reward": -0.08987765479832888,
      "rewards/format_reward": 0.6071428656578064,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2270.9524536132812,
      "epoch": 0.812,
      "grad_norm": 2.2510244846343994,
      "kl": 1.05859375,
      "learning_rate": 7.667891533457718e-07,
      "loss": 0.1778,
      "reward": 0.5268369093537331,
      "reward_std": 0.7606751769781113,
      "rewards/cosine_scaled_reward": -0.05205773119814694,
      "rewards/format_reward": 0.630952388048172,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2307.83935546875,
      "epoch": 0.816,
      "grad_norm": 3.0754034519195557,
      "kl": 1.107421875,
      "learning_rate": 7.640308940816239e-07,
      "loss": 0.1251,
      "reward": 0.046380717772990465,
      "reward_std": 0.6517826318740845,
      "rewards/cosine_scaled_reward": -0.23573821783065796,
      "rewards/format_reward": 0.5178571566939354,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2065.482177734375,
      "epoch": 0.82,
      "grad_norm": 3.317054033279419,
      "kl": 0.8037109375,
      "learning_rate": 7.612622032536507e-07,
      "loss": 0.1229,
      "reward": 0.629617914557457,
      "reward_std": 0.7360707223415375,
      "rewards/cosine_scaled_reward": -0.012572012841701508,
      "rewards/format_reward": 0.654761902987957,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2119.2679443359375,
      "epoch": 0.824,
      "grad_norm": 1.985148310661316,
      "kl": 0.70849609375,
      "learning_rate": 7.584832158039378e-07,
      "loss": 0.1697,
      "reward": 0.4503296762704849,
      "reward_std": 0.7717154771089554,
      "rewards/cosine_scaled_reward": -0.07840658072382212,
      "rewards/format_reward": 0.6071428656578064,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2085.839324951172,
      "epoch": 0.828,
      "grad_norm": 2.4033172130584717,
      "kl": 0.6025390625,
      "learning_rate": 7.556940671764124e-07,
      "loss": 0.1578,
      "reward": 0.4145805863663554,
      "reward_std": 0.7361099421977997,
      "rewards/cosine_scaled_reward": -0.11116209626197815,
      "rewards/format_reward": 0.6369047611951828,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1872.482177734375,
      "epoch": 0.832,
      "grad_norm": 2.11576247215271,
      "kl": 0.408203125,
      "learning_rate": 7.528948933102438e-07,
      "loss": 0.0839,
      "reward": 0.4670650511980057,
      "reward_std": 0.7250475585460663,
      "rewards/cosine_scaled_reward": -0.10872937482781708,
      "rewards/format_reward": 0.6845238208770752,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2085.6786499023438,
      "epoch": 0.836,
      "grad_norm": 0.8786793351173401,
      "kl": 0.51806640625,
      "learning_rate": 7.500858306332172e-07,
      "loss": 0.0649,
      "reward": 0.46545055881142616,
      "reward_std": 0.6805593073368073,
      "rewards/cosine_scaled_reward": -0.09465568419545889,
      "rewards/format_reward": 0.6547619104385376,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2425.7916870117188,
      "epoch": 0.84,
      "grad_norm": 1.3337445259094238,
      "kl": 0.58837890625,
      "learning_rate": 7.472670160550848e-07,
      "loss": 0.1561,
      "reward": 0.4684627018868923,
      "reward_std": 0.824245348572731,
      "rewards/cosine_scaled_reward": -0.04553056287113577,
      "rewards/format_reward": 0.5595238208770752,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2630.5655517578125,
      "epoch": 0.844,
      "grad_norm": 1.3039979934692383,
      "kl": 0.732421875,
      "learning_rate": 7.444385869608921e-07,
      "loss": 0.1559,
      "reward": 0.1796425711363554,
      "reward_std": 0.6979469060897827,
      "rewards/cosine_scaled_reward": -0.17803586274385452,
      "rewards/format_reward": 0.5357143059372902,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1983.9762268066406,
      "epoch": 0.848,
      "grad_norm": 0.9129418134689331,
      "kl": 0.546875,
      "learning_rate": 7.416006812042827e-07,
      "loss": 0.1352,
      "reward": 0.4564796891063452,
      "reward_std": 0.6133182421326637,
      "rewards/cosine_scaled_reward": -0.11402205377817154,
      "rewards/format_reward": 0.6845238208770752,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2315.5119018554688,
      "epoch": 0.852,
      "grad_norm": 1.2220977544784546,
      "kl": 0.5751953125,
      "learning_rate": 7.387534371007797e-07,
      "loss": 0.1683,
      "reward": 0.6708191484212875,
      "reward_std": 0.9547160714864731,
      "rewards/cosine_scaled_reward": 0.016957183834165335,
      "rewards/format_reward": 0.636904776096344,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2233.0655212402344,
      "epoch": 0.856,
      "grad_norm": 0.7978451251983643,
      "kl": 0.61474609375,
      "learning_rate": 7.358969934210438e-07,
      "loss": 0.1304,
      "reward": 0.40765415877103806,
      "reward_std": 0.7158278822898865,
      "rewards/cosine_scaled_reward": -0.12057768838712946,
      "rewards/format_reward": 0.6488095372915268,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2194.791748046875,
      "epoch": 0.86,
      "grad_norm": 1.0176509618759155,
      "kl": 0.61181640625,
      "learning_rate": 7.330314893841101e-07,
      "loss": 0.0991,
      "reward": 0.5912733934819698,
      "reward_std": 0.6540912538766861,
      "rewards/cosine_scaled_reward": -0.013887112960219383,
      "rewards/format_reward": 0.6190476194024086,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2189.5535888671875,
      "epoch": 0.864,
      "grad_norm": 0.7862021923065186,
      "kl": 0.6572265625,
      "learning_rate": 7.301570646506027e-07,
      "loss": 0.1369,
      "reward": 0.4810000769793987,
      "reward_std": 0.6697472035884857,
      "rewards/cosine_scaled_reward": -0.09283328615128994,
      "rewards/format_reward": 0.6666666716337204,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2238.6487731933594,
      "epoch": 0.868,
      "grad_norm": 0.675116240978241,
      "kl": 0.662109375,
      "learning_rate": 7.27273859315928e-07,
      "loss": 0.1234,
      "reward": 0.3597661480307579,
      "reward_std": 0.6638298779726028,
      "rewards/cosine_scaled_reward": -0.13559313118457794,
      "rewards/format_reward": 0.6309523731470108,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2330.7560424804688,
      "epoch": 0.872,
      "grad_norm": 0.7294526696205139,
      "kl": 0.6875,
      "learning_rate": 7.243820139034464e-07,
      "loss": 0.1182,
      "reward": 0.5070892386138439,
      "reward_std": 0.770987793803215,
      "rewards/cosine_scaled_reward": -0.029193488880991936,
      "rewards/format_reward": 0.565476194024086,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2425.4048461914062,
      "epoch": 0.876,
      "grad_norm": 0.9955194592475891,
      "kl": 0.76171875,
      "learning_rate": 7.214816693576234e-07,
      "loss": 0.145,
      "reward": 0.3850390911102295,
      "reward_std": 0.72886823117733,
      "rewards/cosine_scaled_reward": -0.11700426135212183,
      "rewards/format_reward": 0.6190476417541504,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2443.869110107422,
      "epoch": 0.88,
      "grad_norm": 0.8245673179626465,
      "kl": 0.7412109375,
      "learning_rate": 7.185729670371604e-07,
      "loss": 0.1517,
      "reward": 0.3367026010528207,
      "reward_std": 0.6719767898321152,
      "rewards/cosine_scaled_reward": -0.1471248921006918,
      "rewards/format_reward": 0.630952388048172,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2401.65478515625,
      "epoch": 0.884,
      "grad_norm": 0.6434879302978516,
      "kl": 0.552734375,
      "learning_rate": 7.156560487081051e-07,
      "loss": 0.0964,
      "reward": 0.5589644331485033,
      "reward_std": 0.6387112140655518,
      "rewards/cosine_scaled_reward": -0.030041599762625992,
      "rewards/format_reward": 0.619047611951828,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2159.9107666015625,
      "epoch": 0.888,
      "grad_norm": 0.8747764229774475,
      "kl": 0.48974609375,
      "learning_rate": 7.127310565369415e-07,
      "loss": 0.0648,
      "reward": 1.0575831979513168,
      "reward_std": 0.8345089554786682,
      "rewards/cosine_scaled_reward": 0.15379157848656178,
      "rewards/format_reward": 0.7500000149011612,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2736.8095703125,
      "epoch": 0.892,
      "grad_norm": 1.644534707069397,
      "kl": 0.650390625,
      "learning_rate": 7.097981330836616e-07,
      "loss": 0.0898,
      "reward": 0.28782752249389887,
      "reward_std": 0.6842672526836395,
      "rewards/cosine_scaled_reward": -0.11799101112410426,
      "rewards/format_reward": 0.5238095298409462,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2585.4464721679688,
      "epoch": 0.896,
      "grad_norm": 0.5411848425865173,
      "kl": 0.603515625,
      "learning_rate": 7.068574212948169e-07,
      "loss": 0.1243,
      "reward": 0.49723897874355316,
      "reward_std": 0.810086615383625,
      "rewards/cosine_scaled_reward": -0.07280909270048141,
      "rewards/format_reward": 0.6428571492433548,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2394.8988647460938,
      "epoch": 0.9,
      "grad_norm": 0.7165555357933044,
      "kl": 0.52783203125,
      "learning_rate": 7.039090644965509e-07,
      "loss": 0.0888,
      "reward": 0.5129196643829346,
      "reward_std": 0.787805512547493,
      "rewards/cosine_scaled_reward": -0.056040180614218116,
      "rewards/format_reward": 0.6250000298023224,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2482.7678833007812,
      "epoch": 0.904,
      "grad_norm": 0.5211958289146423,
      "kl": 0.51416015625,
      "learning_rate": 7.009532063876148e-07,
      "loss": 0.0812,
      "reward": 0.4906727410852909,
      "reward_std": 0.7880082875490189,
      "rewards/cosine_scaled_reward": -0.0641874436987564,
      "rewards/format_reward": 0.6190476417541504,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2290.0000610351562,
      "epoch": 0.908,
      "grad_norm": 0.5630519986152649,
      "kl": 0.382568359375,
      "learning_rate": 6.979899910323624e-07,
      "loss": 0.1034,
      "reward": 0.6861637309193611,
      "reward_std": 0.7359699308872223,
      "rewards/cosine_scaled_reward": -0.049775293562561274,
      "rewards/format_reward": 0.7857142984867096,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2686.7440795898438,
      "epoch": 0.912,
      "grad_norm": 0.6647688746452332,
      "kl": 0.4326171875,
      "learning_rate": 6.950195628537299e-07,
      "loss": 0.0594,
      "reward": 0.5352285588160157,
      "reward_std": 0.7634364515542984,
      "rewards/cosine_scaled_reward": -0.047861908678896725,
      "rewards/format_reward": 0.6309524029493332,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2687.5416564941406,
      "epoch": 0.916,
      "grad_norm": 0.37424567341804504,
      "kl": 0.39208984375,
      "learning_rate": 6.920420666261961e-07,
      "loss": 0.0465,
      "reward": 0.43462158273905516,
      "reward_std": 0.6648337990045547,
      "rewards/cosine_scaled_reward": -0.1070939814671874,
      "rewards/format_reward": 0.6488095298409462,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2462.8452758789062,
      "epoch": 0.92,
      "grad_norm": 0.52641361951828,
      "kl": 0.37158203125,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.1061,
      "reward": 0.5536616146564484,
      "reward_std": 0.6706894189119339,
      "rewards/cosine_scaled_reward": -0.06840727850794792,
      "rewards/format_reward": 0.690476194024086,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2395.279754638672,
      "epoch": 0.924,
      "grad_norm": 0.5165700912475586,
      "kl": 0.369140625,
      "learning_rate": 6.860664508377001e-07,
      "loss": 0.0951,
      "reward": 0.4786584824323654,
      "reward_std": 0.774825245141983,
      "rewards/cosine_scaled_reward": -0.1267421804368496,
      "rewards/format_reward": 0.7321428656578064,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2468.2857666015625,
      "epoch": 0.928,
      "grad_norm": 0.4581441879272461,
      "kl": 0.31591796875,
      "learning_rate": 6.83068622519821e-07,
      "loss": 0.0555,
      "reward": 0.6299031171947718,
      "reward_std": 0.7808382511138916,
      "rewards/cosine_scaled_reward": -0.045167478267103434,
      "rewards/format_reward": 0.7202381044626236,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2712.58935546875,
      "epoch": 0.932,
      "grad_norm": 0.7744795083999634,
      "kl": 0.333984375,
      "learning_rate": 6.800643086250121e-07,
      "loss": 0.0623,
      "reward": 0.6912369206547737,
      "reward_std": 0.7789230197668076,
      "rewards/cosine_scaled_reward": 0.04204704426229,
      "rewards/format_reward": 0.607142873108387,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2556.619110107422,
      "epoch": 0.936,
      "grad_norm": 0.8385416865348816,
      "kl": 0.38427734375,
      "learning_rate": 6.770536555792944e-07,
      "loss": 0.0662,
      "reward": 0.4830031730234623,
      "reward_std": 0.7291474640369415,
      "rewards/cosine_scaled_reward": -0.07397460378706455,
      "rewards/format_reward": 0.6309523731470108,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2724.8809814453125,
      "epoch": 0.94,
      "grad_norm": 0.6943939328193665,
      "kl": 0.330078125,
      "learning_rate": 6.740368101176495e-07,
      "loss": 0.1008,
      "reward": 0.38701344281435013,
      "reward_std": 0.7834271490573883,
      "rewards/cosine_scaled_reward": -0.11304090730845928,
      "rewards/format_reward": 0.6130952388048172,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2819.7738647460938,
      "epoch": 0.944,
      "grad_norm": 0.3916683495044708,
      "kl": 0.32177734375,
      "learning_rate": 6.710139192768694e-07,
      "loss": 0.0365,
      "reward": 0.5249419808387756,
      "reward_std": 0.8138006925582886,
      "rewards/cosine_scaled_reward": -0.023243289440870285,
      "rewards/format_reward": 0.571428582072258,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2499.0536499023438,
      "epoch": 0.948,
      "grad_norm": 0.9175835847854614,
      "kl": 0.321533203125,
      "learning_rate": 6.679851303883891e-07,
      "loss": 0.1055,
      "reward": 0.6389507204294205,
      "reward_std": 0.8023868650197983,
      "rewards/cosine_scaled_reward": -0.04064369201660156,
      "rewards/format_reward": 0.7202381044626236,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2557.0655517578125,
      "epoch": 0.952,
      "grad_norm": 0.4397272765636444,
      "kl": 0.30810546875,
      "learning_rate": 6.649505910711058e-07,
      "loss": 0.0869,
      "reward": 0.4888541977852583,
      "reward_std": 0.7550098150968552,
      "rewards/cosine_scaled_reward": -0.09783481806516647,
      "rewards/format_reward": 0.684523805975914,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2600.422637939453,
      "epoch": 0.956,
      "grad_norm": 0.9344379305839539,
      "kl": 0.345703125,
      "learning_rate": 6.619104492241847e-07,
      "loss": 0.1329,
      "reward": 0.27865387313067913,
      "reward_std": 0.6713129729032516,
      "rewards/cosine_scaled_reward": -0.18210165202617645,
      "rewards/format_reward": 0.6428571417927742,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2331.8928833007812,
      "epoch": 0.96,
      "grad_norm": 0.5995355248451233,
      "kl": 0.326904296875,
      "learning_rate": 6.588648530198504e-07,
      "loss": 0.0705,
      "reward": 0.7613647617399693,
      "reward_std": 0.8133140057325363,
      "rewards/cosine_scaled_reward": 0.023539513116702437,
      "rewards/format_reward": 0.7142857164144516,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2355.52978515625,
      "epoch": 0.964,
      "grad_norm": 0.3258729875087738,
      "kl": 0.307861328125,
      "learning_rate": 6.558139508961654e-07,
      "loss": 0.0608,
      "reward": 0.6096780672669411,
      "reward_std": 0.7518916502594948,
      "rewards/cosine_scaled_reward": -0.05230383496382274,
      "rewards/format_reward": 0.7142857313156128,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2414.0834350585938,
      "epoch": 0.968,
      "grad_norm": 0.3521139919757843,
      "kl": 0.3212890625,
      "learning_rate": 6.527578915497951e-07,
      "loss": 0.0742,
      "reward": 0.6254040375351906,
      "reward_std": 0.8331593424081802,
      "rewards/cosine_scaled_reward": -0.03551226551644504,
      "rewards/format_reward": 0.696428582072258,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2138.107177734375,
      "epoch": 0.972,
      "grad_norm": 0.5599615573883057,
      "kl": 0.33251953125,
      "learning_rate": 6.496968239287603e-07,
      "loss": 0.0315,
      "reward": 0.8373362571001053,
      "reward_std": 0.6551230400800705,
      "rewards/cosine_scaled_reward": 0.04366813227534294,
      "rewards/format_reward": 0.7500000149011612,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2277.714324951172,
      "epoch": 0.976,
      "grad_norm": 0.6147165298461914,
      "kl": 0.336181640625,
      "learning_rate": 6.466308972251785e-07,
      "loss": 0.1075,
      "reward": 0.46155789494514465,
      "reward_std": 0.6391154229640961,
      "rewards/cosine_scaled_reward": -0.14124487387016416,
      "rewards/format_reward": 0.7440476417541504,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2401.232208251953,
      "epoch": 0.98,
      "grad_norm": 0.8454631567001343,
      "kl": 0.4814453125,
      "learning_rate": 6.435602608679916e-07,
      "loss": 0.0417,
      "reward": 0.5565547049045563,
      "reward_std": 0.6768698394298553,
      "rewards/cosine_scaled_reward": -0.04315121428226121,
      "rewards/format_reward": 0.6428571492433548,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2027.3453369140625,
      "epoch": 0.984,
      "grad_norm": 0.38341155648231506,
      "kl": 0.289794921875,
      "learning_rate": 6.404850645156841e-07,
      "loss": 0.0993,
      "reward": 0.7784423977136612,
      "reward_std": 0.6467820554971695,
      "rewards/cosine_scaled_reward": -0.009588314220309258,
      "rewards/format_reward": 0.7976190596818924,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2170.5416564941406,
      "epoch": 0.988,
      "grad_norm": 0.445024311542511,
      "kl": 0.4326171875,
      "learning_rate": 6.374054580489873e-07,
      "loss": 0.1244,
      "reward": 0.6971250772476196,
      "reward_std": 0.7919557690620422,
      "rewards/cosine_scaled_reward": -0.026437478853040375,
      "rewards/format_reward": 0.7500000149011612,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2060.2559814453125,
      "epoch": 0.992,
      "grad_norm": 0.49659866094589233,
      "kl": 0.36865234375,
      "learning_rate": 6.343215915635761e-07,
      "loss": 0.0959,
      "reward": 0.6287773251533508,
      "reward_std": 0.7386345416307449,
      "rewards/cosine_scaled_reward": -0.060611339285969734,
      "rewards/format_reward": 0.7500000149011612,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2148.869140625,
      "epoch": 0.996,
      "grad_norm": 0.4539166986942291,
      "kl": 0.3701171875,
      "learning_rate": 6.31233615362752e-07,
      "loss": 0.1019,
      "reward": 0.4996798560023308,
      "reward_std": 0.6163481399416924,
      "rewards/cosine_scaled_reward": -0.11027912324061617,
      "rewards/format_reward": 0.7202381044626236,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2330.3482971191406,
      "epoch": 1.0,
      "grad_norm": 0.5344291925430298,
      "kl": 0.5205078125,
      "learning_rate": 6.281416799501187e-07,
      "loss": 0.0866,
      "reward": 0.42578159645199776,
      "reward_std": 0.7348527163267136,
      "rewards/cosine_scaled_reward": -0.08472825400531292,
      "rewards/format_reward": 0.5952381044626236,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2106.059539794922,
      "epoch": 1.004,
      "grad_norm": 0.5930522680282593,
      "kl": 0.38232421875,
      "learning_rate": 6.25045936022246e-07,
      "loss": 0.1423,
      "reward": 0.5456876549869776,
      "reward_std": 0.6847013607621193,
      "rewards/cosine_scaled_reward": -0.0842990386299789,
      "rewards/format_reward": 0.7142857164144516,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2112.4762573242188,
      "epoch": 1.008,
      "grad_norm": 0.4610899090766907,
      "kl": 0.39111328125,
      "learning_rate": 6.219465344613258e-07,
      "loss": 0.0641,
      "reward": 0.6148004308342934,
      "reward_std": 0.6790047585964203,
      "rewards/cosine_scaled_reward": -0.05867121648043394,
      "rewards/format_reward": 0.7321428805589676,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2289.309600830078,
      "epoch": 1.012,
      "grad_norm": 0.3950199782848358,
      "kl": 0.387451171875,
      "learning_rate": 6.188436263278172e-07,
      "loss": 0.1336,
      "reward": 0.626802071928978,
      "reward_std": 0.6337872818112373,
      "rewards/cosine_scaled_reward": -0.028860883321613073,
      "rewards/format_reward": 0.6845238283276558,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2465.6130981445312,
      "epoch": 1.016,
      "grad_norm": 0.6084108352661133,
      "kl": 0.45947265625,
      "learning_rate": 6.157373628530852e-07,
      "loss": 0.0932,
      "reward": 0.6250473670661449,
      "reward_std": 0.7445118278264999,
      "rewards/cosine_scaled_reward": 2.3671891540288925e-05,
      "rewards/format_reward": 0.6250000074505806,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2127.8988647460938,
      "epoch": 1.02,
      "grad_norm": 0.8596522212028503,
      "kl": 0.368896484375,
      "learning_rate": 6.126278954320294e-07,
      "loss": 0.0589,
      "reward": 0.4597589522600174,
      "reward_std": 0.710930123925209,
      "rewards/cosine_scaled_reward": -0.1361919562332332,
      "rewards/format_reward": 0.7321428656578064,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2113.029815673828,
      "epoch": 1.024,
      "grad_norm": 0.6557802557945251,
      "kl": 0.39306640625,
      "learning_rate": 6.095153756157051e-07,
      "loss": 0.0808,
      "reward": 0.7969172149896622,
      "reward_std": 0.7165066450834274,
      "rewards/cosine_scaled_reward": 0.02643477637320757,
      "rewards/format_reward": 0.7440476268529892,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2350.4881591796875,
      "epoch": 1.028,
      "grad_norm": 0.7259902954101562,
      "kl": 0.37548828125,
      "learning_rate": 6.06399955103937e-07,
      "loss": 0.0556,
      "reward": 0.6144686937332153,
      "reward_std": 0.7161982655525208,
      "rewards/cosine_scaled_reward": -0.0052656568586826324,
      "rewards/format_reward": 0.6250000149011612,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2659.2500610351562,
      "epoch": 1.032,
      "grad_norm": 0.6974296569824219,
      "kl": 0.4482421875,
      "learning_rate": 6.032817857379256e-07,
      "loss": 0.1425,
      "reward": 0.38613639771938324,
      "reward_std": 0.7526693046092987,
      "rewards/cosine_scaled_reward": -0.10455084778368473,
      "rewards/format_reward": 0.595238097012043,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2219.6964721679688,
      "epoch": 1.036,
      "grad_norm": 0.5528798699378967,
      "kl": 0.33984375,
      "learning_rate": 6.001610194928464e-07,
      "loss": 0.1191,
      "reward": 0.6971464306116104,
      "reward_std": 0.7383679300546646,
      "rewards/cosine_scaled_reward": -0.023450596883776598,
      "rewards/format_reward": 0.744047611951828,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2010.1726989746094,
      "epoch": 1.04,
      "grad_norm": 0.36631372570991516,
      "kl": 0.30126953125,
      "learning_rate": 5.97037808470444e-07,
      "loss": 0.0699,
      "reward": 0.771461233496666,
      "reward_std": 0.5148339942097664,
      "rewards/cosine_scaled_reward": -0.01307891309261322,
      "rewards/format_reward": 0.7976190596818924,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2153.559539794922,
      "epoch": 1.044,
      "grad_norm": 0.48435378074645996,
      "kl": 0.3251953125,
      "learning_rate": 5.939123048916173e-07,
      "loss": 0.0931,
      "reward": 0.5015835016965866,
      "reward_std": 0.69777412712574,
      "rewards/cosine_scaled_reward": -0.121232058852911,
      "rewards/format_reward": 0.7440476268529892,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2309.1131591796875,
      "epoch": 1.048,
      "grad_norm": 0.6150787472724915,
      "kl": 0.37060546875,
      "learning_rate": 5.907846610890011e-07,
      "loss": 0.1074,
      "reward": 0.656824603676796,
      "reward_std": 0.7539815902709961,
      "rewards/cosine_scaled_reward": -0.025754368398338556,
      "rewards/format_reward": 0.7083333283662796,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2073.1488647460938,
      "epoch": 1.052,
      "grad_norm": 0.5915967226028442,
      "kl": 0.32958984375,
      "learning_rate": 5.87655029499542e-07,
      "loss": 0.1016,
      "reward": 0.5839189141988754,
      "reward_std": 0.6906930133700371,
      "rewards/cosine_scaled_reward": -0.10089768993202597,
      "rewards/format_reward": 0.7857142984867096,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2258.6607971191406,
      "epoch": 1.056,
      "grad_norm": 0.5032393932342529,
      "kl": 0.421875,
      "learning_rate": 5.845235626570683e-07,
      "loss": 0.0833,
      "reward": 0.7445018216967583,
      "reward_std": 0.7239043861627579,
      "rewards/cosine_scaled_reward": 0.0002271006815135479,
      "rewards/format_reward": 0.7440476417541504,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2421.386962890625,
      "epoch": 1.06,
      "grad_norm": 0.5948444604873657,
      "kl": 0.46826171875,
      "learning_rate": 5.813904131848564e-07,
      "loss": 0.1342,
      "reward": 0.3432777523994446,
      "reward_std": 0.7306928038597107,
      "rewards/cosine_scaled_reward": -0.1527658887207508,
      "rewards/format_reward": 0.6488095223903656,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1943.8214416503906,
      "epoch": 1.064,
      "grad_norm": 0.672618567943573,
      "kl": 0.33251953125,
      "learning_rate": 5.78255733788191e-07,
      "loss": 0.074,
      "reward": 0.5523176118731499,
      "reward_std": 0.6472664028406143,
      "rewards/cosine_scaled_reward": -0.08693643007427454,
      "rewards/format_reward": 0.7261904776096344,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2289.4107666015625,
      "epoch": 1.068,
      "grad_norm": 0.43480613827705383,
      "kl": 0.41064453125,
      "learning_rate": 5.751196772469237e-07,
      "loss": 0.116,
      "reward": 0.6816908866167068,
      "reward_std": 0.7700821459293365,
      "rewards/cosine_scaled_reward": -0.01034504920244217,
      "rewards/format_reward": 0.70238097012043,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2265.166748046875,
      "epoch": 1.072,
      "grad_norm": 0.8894410729408264,
      "kl": 0.37890625,
      "learning_rate": 5.71982396408026e-07,
      "loss": 0.1102,
      "reward": 0.5768959820270538,
      "reward_std": 0.7392304837703705,
      "rewards/cosine_scaled_reward": -0.04786152858287096,
      "rewards/format_reward": 0.6726190447807312,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2102.4345092773438,
      "epoch": 1.076,
      "grad_norm": 1.40628182888031,
      "kl": 0.34814453125,
      "learning_rate": 5.688440441781398e-07,
      "loss": 0.1523,
      "reward": 0.6540864631533623,
      "reward_std": 0.7483679950237274,
      "rewards/cosine_scaled_reward": -0.030099631054326892,
      "rewards/format_reward": 0.7142857164144516,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1760.5893249511719,
      "epoch": 1.08,
      "grad_norm": 0.655262291431427,
      "kl": 0.34228515625,
      "learning_rate": 5.657047735161255e-07,
      "loss": 0.0938,
      "reward": 0.7075737789273262,
      "reward_std": 0.712226152420044,
      "rewards/cosine_scaled_reward": -0.045022654812783,
      "rewards/format_reward": 0.7976190596818924,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1989.1488037109375,
      "epoch": 1.084,
      "grad_norm": 0.5984042286872864,
      "kl": 0.3974609375,
      "learning_rate": 5.625647374256061e-07,
      "loss": 0.0893,
      "reward": 0.5623346008360386,
      "reward_std": 0.7052316814661026,
      "rewards/cosine_scaled_reward": -0.08192794572096318,
      "rewards/format_reward": 0.7261904925107956,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1998.327392578125,
      "epoch": 1.088,
      "grad_norm": 0.41462650895118713,
      "kl": 0.37939453125,
      "learning_rate": 5.594240889475106e-07,
      "loss": 0.1384,
      "reward": 0.6586858294904232,
      "reward_std": 0.8071554154157639,
      "rewards/cosine_scaled_reward": -0.018871376756578684,
      "rewards/format_reward": 0.696428582072258,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1905.9880981445312,
      "epoch": 1.092,
      "grad_norm": 1.1817877292633057,
      "kl": 0.4287109375,
      "learning_rate": 5.562829811526154e-07,
      "loss": 0.108,
      "reward": 0.585694283246994,
      "reward_std": 0.6987177431583405,
      "rewards/cosine_scaled_reward": -0.08512906730175018,
      "rewards/format_reward": 0.755952388048172,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1958.0892639160156,
      "epoch": 1.096,
      "grad_norm": 0.6756201982498169,
      "kl": 0.44580078125,
      "learning_rate": 5.531415671340826e-07,
      "loss": 0.1298,
      "reward": 0.5423668641597033,
      "reward_std": 0.5766877979040146,
      "rewards/cosine_scaled_reward": -0.09786419570446014,
      "rewards/format_reward": 0.7380952537059784,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1423.4405212402344,
      "epoch": 1.1,
      "grad_norm": 0.9936150908470154,
      "kl": 0.283203125,
      "learning_rate": 5.5e-07,
      "loss": 0.0068,
      "reward": 0.8336242958903313,
      "reward_std": 0.6556554213166237,
      "rewards/cosine_scaled_reward": -0.02366404954227619,
      "rewards/format_reward": 0.8809524178504944,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1477.011962890625,
      "epoch": 1.104,
      "grad_norm": 1.4654834270477295,
      "kl": 0.30419921875,
      "learning_rate": 5.468584328659172e-07,
      "loss": 0.1583,
      "reward": 0.9086148589849472,
      "reward_std": 0.7289283871650696,
      "rewards/cosine_scaled_reward": 0.0197836235165596,
      "rewards/format_reward": 0.8690476417541504,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1600.8333740234375,
      "epoch": 1.108,
      "grad_norm": 0.5991122126579285,
      "kl": 0.39697265625,
      "learning_rate": 5.437170188473847e-07,
      "loss": 0.0615,
      "reward": 0.6998666599392891,
      "reward_std": 0.6800315380096436,
      "rewards/cosine_scaled_reward": -0.06375712971203029,
      "rewards/format_reward": 0.82738097012043,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2001.3035583496094,
      "epoch": 1.112,
      "grad_norm": 0.9033568501472473,
      "kl": 0.4404296875,
      "learning_rate": 5.405759110524894e-07,
      "loss": 0.0566,
      "reward": 0.5947119817137718,
      "reward_std": 0.6757695525884628,
      "rewards/cosine_scaled_reward": -0.0806201882660389,
      "rewards/format_reward": 0.755952388048172,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1884.9286499023438,
      "epoch": 1.116,
      "grad_norm": 1.0505043268203735,
      "kl": 0.41162109375,
      "learning_rate": 5.37435262574394e-07,
      "loss": 0.0838,
      "reward": 0.546771340072155,
      "reward_std": 0.5643983408808708,
      "rewards/cosine_scaled_reward": -0.13137624226510525,
      "rewards/format_reward": 0.8095238208770752,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1721.6309814453125,
      "epoch": 1.12,
      "grad_norm": 2.6171982288360596,
      "kl": 0.35400390625,
      "learning_rate": 5.342952264838747e-07,
      "loss": 0.119,
      "reward": 0.7959851026535034,
      "reward_std": 0.6236628741025925,
      "rewards/cosine_scaled_reward": -0.03057891083881259,
      "rewards/format_reward": 0.8571428805589676,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1974.3809814453125,
      "epoch": 1.124,
      "grad_norm": 0.9569424390792847,
      "kl": 0.4814453125,
      "learning_rate": 5.311559558218603e-07,
      "loss": 0.1494,
      "reward": 0.573462575674057,
      "reward_std": 0.6640851646661758,
      "rewards/cosine_scaled_reward": -0.09422110859304667,
      "rewards/format_reward": 0.761904776096344,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1699.2142944335938,
      "epoch": 1.1280000000000001,
      "grad_norm": 0.5432654619216919,
      "kl": 0.33935546875,
      "learning_rate": 5.28017603591974e-07,
      "loss": 0.0877,
      "reward": 0.7524446099996567,
      "reward_std": 0.6557567343115807,
      "rewards/cosine_scaled_reward": -0.04342056508176029,
      "rewards/format_reward": 0.8392857164144516,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2023.1012573242188,
      "epoch": 1.1320000000000001,
      "grad_norm": 1.5788854360580444,
      "kl": 0.498046875,
      "learning_rate": 5.248803227530763e-07,
      "loss": 0.1449,
      "reward": 0.471544723957777,
      "reward_std": 0.7016247361898422,
      "rewards/cosine_scaled_reward": -0.14220385067164898,
      "rewards/format_reward": 0.755952388048172,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1751.6190795898438,
      "epoch": 1.1360000000000001,
      "grad_norm": 0.8654693365097046,
      "kl": 0.45654296875,
      "learning_rate": 5.21744266211809e-07,
      "loss": 0.096,
      "reward": 0.8401590138673782,
      "reward_std": 0.7027324140071869,
      "rewards/cosine_scaled_reward": -0.008491916581988335,
      "rewards/format_reward": 0.8571428805589676,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1953.1309814453125,
      "epoch": 1.1400000000000001,
      "grad_norm": 0.7724223732948303,
      "kl": 0.43017578125,
      "learning_rate": 5.186095868151436e-07,
      "loss": 0.1257,
      "reward": 0.5251086875796318,
      "reward_std": 0.75553198158741,
      "rewards/cosine_scaled_reward": -0.11244566680397838,
      "rewards/format_reward": 0.7500000298023224,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1932.9940795898438,
      "epoch": 1.144,
      "grad_norm": 0.6642920970916748,
      "kl": 0.470703125,
      "learning_rate": 5.154764373429315e-07,
      "loss": 0.096,
      "reward": 0.8715938031673431,
      "reward_std": 0.7678115516901016,
      "rewards/cosine_scaled_reward": 0.036987369414418936,
      "rewards/format_reward": 0.7976190745830536,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1784.1428527832031,
      "epoch": 1.148,
      "grad_norm": 0.9823849201202393,
      "kl": 0.38134765625,
      "learning_rate": 5.123449705004581e-07,
      "loss": 0.0437,
      "reward": 0.7326274067163467,
      "reward_std": 0.6021066680550575,
      "rewards/cosine_scaled_reward": -0.044400574173778296,
      "rewards/format_reward": 0.8214285969734192,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1929.0536193847656,
      "epoch": 1.152,
      "grad_norm": 2.430745840072632,
      "kl": 0.45458984375,
      "learning_rate": 5.09215338910999e-07,
      "loss": 0.1275,
      "reward": 0.76754130423069,
      "reward_std": 0.6635829508304596,
      "rewards/cosine_scaled_reward": -0.0001579252420924604,
      "rewards/format_reward": 0.7678571492433548,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1567.4226684570312,
      "epoch": 1.156,
      "grad_norm": 1.8522855043411255,
      "kl": 0.35302734375,
      "learning_rate": 5.060876951083828e-07,
      "loss": 0.0659,
      "reward": 0.8090793639421463,
      "reward_std": 0.6970714181661606,
      "rewards/cosine_scaled_reward": -0.02105556521564722,
      "rewards/format_reward": 0.8511905074119568,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1916.0298156738281,
      "epoch": 1.16,
      "grad_norm": 0.8320524096488953,
      "kl": 0.353515625,
      "learning_rate": 5.02962191529556e-07,
      "loss": 0.0397,
      "reward": 0.8147249445319176,
      "reward_std": 0.7559010833501816,
      "rewards/cosine_scaled_reward": 0.014505308354273438,
      "rewards/format_reward": 0.7857143133878708,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1871.5595703125,
      "epoch": 1.164,
      "grad_norm": 1.639461636543274,
      "kl": 0.44482421875,
      "learning_rate": 4.998389805071536e-07,
      "loss": 0.02,
      "reward": 0.7966814041137695,
      "reward_std": 0.6868171393871307,
      "rewards/cosine_scaled_reward": -0.03320692107081413,
      "rewards/format_reward": 0.8630952686071396,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1849.0714721679688,
      "epoch": 1.168,
      "grad_norm": 0.9159106016159058,
      "kl": 0.41357421875,
      "learning_rate": 4.967182142620745e-07,
      "loss": 0.1098,
      "reward": 0.8123535662889481,
      "reward_std": 0.7406510710716248,
      "rewards/cosine_scaled_reward": -0.0075137000530958176,
      "rewards/format_reward": 0.82738097012043,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1627.2262268066406,
      "epoch": 1.172,
      "grad_norm": 1.2907826900482178,
      "kl": 0.3271484375,
      "learning_rate": 4.93600044896063e-07,
      "loss": 0.028,
      "reward": 0.7378726750612259,
      "reward_std": 0.6904594451189041,
      "rewards/cosine_scaled_reward": -0.07153987139463425,
      "rewards/format_reward": 0.8809524029493332,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2141.2202758789062,
      "epoch": 1.176,
      "grad_norm": 0.7737708687782288,
      "kl": 0.4482421875,
      "learning_rate": 4.904846243842949e-07,
      "loss": 0.0644,
      "reward": 0.7625293210148811,
      "reward_std": 0.7152971476316452,
      "rewards/cosine_scaled_reward": 0.009240844286978245,
      "rewards/format_reward": 0.7440476417541504,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2187.3809814453125,
      "epoch": 1.18,
      "grad_norm": 1.1525542736053467,
      "kl": 0.51025390625,
      "learning_rate": 4.873721045679706e-07,
      "loss": 0.0634,
      "reward": 0.5901899486780167,
      "reward_std": 0.6728092133998871,
      "rewards/cosine_scaled_reward": -0.0739526596153155,
      "rewards/format_reward": 0.7380952537059784,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2388.577392578125,
      "epoch": 1.184,
      "grad_norm": 0.9084761738777161,
      "kl": 0.52734375,
      "learning_rate": 4.842626371469149e-07,
      "loss": 0.0587,
      "reward": 0.4302752036601305,
      "reward_std": 0.615352213382721,
      "rewards/cosine_scaled_reward": -0.12117192603182048,
      "rewards/format_reward": 0.6726190596818924,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1568.6786193847656,
      "epoch": 1.188,
      "grad_norm": 0.852024495601654,
      "kl": 0.1533203125,
      "learning_rate": 4.811563736721829e-07,
      "loss": 0.0574,
      "reward": 0.7380149587988853,
      "reward_std": 0.7155523598194122,
      "rewards/cosine_scaled_reward": -0.029802043922245502,
      "rewards/format_reward": 0.7976190596818924,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1983.4524230957031,
      "epoch": 1.192,
      "grad_norm": 0.7617373466491699,
      "kl": 0.303955078125,
      "learning_rate": 4.780534655386743e-07,
      "loss": 0.068,
      "reward": 0.7127486318349838,
      "reward_std": 0.7076264545321465,
      "rewards/cosine_scaled_reward": -0.018625682685524225,
      "rewards/format_reward": 0.7500000149011612,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2038.3750305175781,
      "epoch": 1.196,
      "grad_norm": 0.8094474673271179,
      "kl": 0.25830078125,
      "learning_rate": 4.749540639777539e-07,
      "loss": 0.0566,
      "reward": 0.6301854252815247,
      "reward_std": 0.6336864829063416,
      "rewards/cosine_scaled_reward": -0.036097751930356026,
      "rewards/format_reward": 0.7023809552192688,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1825.9643249511719,
      "epoch": 1.2,
      "grad_norm": 1.8039993047714233,
      "kl": 0.225830078125,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.0501,
      "reward": 0.9151953011751175,
      "reward_std": 0.6518659368157387,
      "rewards/cosine_scaled_reward": 0.03200240898877382,
      "rewards/format_reward": 0.8511905074119568,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1954.3809814453125,
      "epoch": 1.204,
      "grad_norm": 0.9098180532455444,
      "kl": 0.2685546875,
      "learning_rate": 4.68766384637248e-07,
      "loss": 0.08,
      "reward": 0.903901144862175,
      "reward_std": 0.7074443101882935,
      "rewards/cosine_scaled_reward": 0.059093400835990906,
      "rewards/format_reward": 0.7857142835855484,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2221.6012268066406,
      "epoch": 1.208,
      "grad_norm": 0.628447949886322,
      "kl": 0.297119140625,
      "learning_rate": 4.656784084364238e-07,
      "loss": 0.0225,
      "reward": 0.7435066364705563,
      "reward_std": 0.7286128550767899,
      "rewards/cosine_scaled_reward": 0.002705696038901806,
      "rewards/format_reward": 0.7380952537059784,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1967.6607360839844,
      "epoch": 1.212,
      "grad_norm": 1.4870760440826416,
      "kl": 0.2193603515625,
      "learning_rate": 4.6259454195101267e-07,
      "loss": 0.0076,
      "reward": 0.6118638888001442,
      "reward_std": 0.6256552934646606,
      "rewards/cosine_scaled_reward": -0.08990138117223978,
      "rewards/format_reward": 0.7916666716337204,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2091.8810119628906,
      "epoch": 1.216,
      "grad_norm": 1.0213916301727295,
      "kl": 0.28857421875,
      "learning_rate": 4.59514935484316e-07,
      "loss": 0.0819,
      "reward": 0.8858746439218521,
      "reward_std": 0.760543704032898,
      "rewards/cosine_scaled_reward": 0.04412779211997986,
      "rewards/format_reward": 0.7976190596818924,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2117.5357971191406,
      "epoch": 1.22,
      "grad_norm": 1.1696289777755737,
      "kl": 0.266845703125,
      "learning_rate": 4.5643973913200837e-07,
      "loss": 0.0319,
      "reward": 0.4878672659397125,
      "reward_std": 0.5883132815361023,
      "rewards/cosine_scaled_reward": -0.13999494537711143,
      "rewards/format_reward": 0.7678571492433548,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1962.8512268066406,
      "epoch": 1.224,
      "grad_norm": 1.1181604862213135,
      "kl": 0.26171875,
      "learning_rate": 4.5336910277482155e-07,
      "loss": 0.0553,
      "reward": 0.8040256127715111,
      "reward_std": 0.7542890757322311,
      "rewards/cosine_scaled_reward": 0.00915566342882812,
      "rewards/format_reward": 0.7857143059372902,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2503.541717529297,
      "epoch": 1.228,
      "grad_norm": 1.0075181722640991,
      "kl": 0.274658203125,
      "learning_rate": 4.503031760712397e-07,
      "loss": 0.0639,
      "reward": 0.5502185635268688,
      "reward_std": 0.7036140263080597,
      "rewards/cosine_scaled_reward": -0.043343101628124714,
      "rewards/format_reward": 0.6369047686457634,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2027.6905212402344,
      "epoch": 1.232,
      "grad_norm": 2.7786951065063477,
      "kl": 0.265380859375,
      "learning_rate": 4.4724210845020494e-07,
      "loss": 0.1529,
      "reward": 0.8017951250076294,
      "reward_std": 0.7912951856851578,
      "rewards/cosine_scaled_reward": 0.005064212018623948,
      "rewards/format_reward": 0.7916666865348816,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2560.6845703125,
      "epoch": 1.236,
      "grad_norm": 1.6693713665008545,
      "kl": 0.2939453125,
      "learning_rate": 4.441860491038345e-07,
      "loss": 0.1046,
      "reward": 0.6068699322640896,
      "reward_std": 0.7445466667413712,
      "rewards/cosine_scaled_reward": -0.00013647368177771568,
      "rewards/format_reward": 0.6071428656578064,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2221.3988037109375,
      "epoch": 1.24,
      "grad_norm": 0.7167072892189026,
      "kl": 0.253173828125,
      "learning_rate": 4.4113514698014953e-07,
      "loss": 0.046,
      "reward": 0.5108997635543346,
      "reward_std": 0.6983606815338135,
      "rewards/cosine_scaled_reward": -0.09276440553367138,
      "rewards/format_reward": 0.696428582072258,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2308.1012573242188,
      "epoch": 1.244,
      "grad_norm": 1.289093255996704,
      "kl": 0.24072265625,
      "learning_rate": 4.3808955077581546e-07,
      "loss": 0.074,
      "reward": 0.49418094009160995,
      "reward_std": 0.6803844273090363,
      "rewards/cosine_scaled_reward": -0.0862428704276681,
      "rewards/format_reward": 0.6666666716337204,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2221.500030517578,
      "epoch": 1.248,
      "grad_norm": 0.7747544646263123,
      "kl": 0.284423828125,
      "learning_rate": 4.350494089288943e-07,
      "loss": 0.0127,
      "reward": 0.5928547494113445,
      "reward_std": 0.6995180547237396,
      "rewards/cosine_scaled_reward": -0.0875012082979083,
      "rewards/format_reward": 0.7678571492433548,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2528.3750610351562,
      "epoch": 1.252,
      "grad_norm": 0.9067274928092957,
      "kl": 0.261962890625,
      "learning_rate": 4.3201486961161093e-07,
      "loss": 0.0588,
      "reward": 0.580617468804121,
      "reward_std": 0.7565959244966507,
      "rewards/cosine_scaled_reward": -0.05195318069308996,
      "rewards/format_reward": 0.6845238208770752,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2290.434539794922,
      "epoch": 1.256,
      "grad_norm": 1.0397149324417114,
      "kl": 0.2939453125,
      "learning_rate": 4.2898608072313045e-07,
      "loss": 0.0843,
      "reward": 0.923637330532074,
      "reward_std": 0.8029050081968307,
      "rewards/cosine_scaled_reward": 0.07491390081122518,
      "rewards/format_reward": 0.7738095372915268,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2221.1785583496094,
      "epoch": 1.26,
      "grad_norm": 0.8451793789863586,
      "kl": 0.2978515625,
      "learning_rate": 4.2596318988235037e-07,
      "loss": 0.0794,
      "reward": 0.9175606220960617,
      "reward_std": 0.6950835883617401,
      "rewards/cosine_scaled_reward": 0.06592314876616001,
      "rewards/format_reward": 0.7857142984867096,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2737.1131591796875,
      "epoch": 1.264,
      "grad_norm": 0.8914613723754883,
      "kl": 0.4072265625,
      "learning_rate": 4.2294634442070553e-07,
      "loss": 0.0266,
      "reward": 0.39799112919718027,
      "reward_std": 0.5211281925439835,
      "rewards/cosine_scaled_reward": -0.0896949004381895,
      "rewards/format_reward": 0.5773809626698494,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2356.8155517578125,
      "epoch": 1.268,
      "grad_norm": 0.8658885955810547,
      "kl": 0.32080078125,
      "learning_rate": 4.1993569137498776e-07,
      "loss": 0.0558,
      "reward": 0.4528093598783016,
      "reward_std": 0.5718662440776825,
      "rewards/cosine_scaled_reward": -0.11585722491145134,
      "rewards/format_reward": 0.6845238283276558,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2067.0952758789062,
      "epoch": 1.272,
      "grad_norm": 0.6174459457397461,
      "kl": 0.2724609375,
      "learning_rate": 4.1693137748017915e-07,
      "loss": 0.0532,
      "reward": 0.9527914822101593,
      "reward_std": 0.7573249191045761,
      "rewards/cosine_scaled_reward": 0.059729063883423805,
      "rewards/format_reward": 0.833333358168602,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2158.3631286621094,
      "epoch": 1.276,
      "grad_norm": 0.5749858617782593,
      "kl": 0.2744140625,
      "learning_rate": 4.1393354916230005e-07,
      "loss": 0.0398,
      "reward": 0.7759583368897438,
      "reward_std": 0.7076128423213959,
      "rewards/cosine_scaled_reward": 0.00702677620574832,
      "rewards/format_reward": 0.761904776096344,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2376.7857666015625,
      "epoch": 1.28,
      "grad_norm": 0.4824450612068176,
      "kl": 0.358642578125,
      "learning_rate": 4.1094235253127374e-07,
      "loss": 0.0579,
      "reward": 0.5863704346120358,
      "reward_std": 0.69185970723629,
      "rewards/cosine_scaled_reward": -0.058005278930068016,
      "rewards/format_reward": 0.7023809552192688,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2094.6786193847656,
      "epoch": 1.284,
      "grad_norm": 1.153307318687439,
      "kl": 0.32373046875,
      "learning_rate": 4.079579333738039e-07,
      "loss": 0.0147,
      "reward": 0.5667938031256199,
      "reward_std": 0.6206858605146408,
      "rewards/cosine_scaled_reward": -0.12434119766112417,
      "rewards/format_reward": 0.8154762089252472,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2360.970245361328,
      "epoch": 1.288,
      "grad_norm": 0.7556703090667725,
      "kl": 0.314697265625,
      "learning_rate": 4.0498043714627006e-07,
      "loss": 0.0413,
      "reward": 0.54334956407547,
      "reward_std": 0.7112371101975441,
      "rewards/cosine_scaled_reward": -0.10927761369384825,
      "rewards/format_reward": 0.761904776096344,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2466.9703369140625,
      "epoch": 1.292,
      "grad_norm": 0.6241899728775024,
      "kl": 0.345703125,
      "learning_rate": 4.020100089676376e-07,
      "loss": 0.0463,
      "reward": 0.5620089694857597,
      "reward_std": 0.6381285488605499,
      "rewards/cosine_scaled_reward": -0.0672098146751523,
      "rewards/format_reward": 0.696428582072258,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2297.2916564941406,
      "epoch": 1.296,
      "grad_norm": 1.050784945487976,
      "kl": 0.289306640625,
      "learning_rate": 3.9904679361238526e-07,
      "loss": 0.095,
      "reward": 0.6569867879152298,
      "reward_std": 0.6581598520278931,
      "rewards/cosine_scaled_reward": -0.034601859748363495,
      "rewards/format_reward": 0.7261904925107956,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2391.8095703125,
      "epoch": 1.3,
      "grad_norm": 0.5910518169403076,
      "kl": 0.327392578125,
      "learning_rate": 3.9609093550344907e-07,
      "loss": 0.065,
      "reward": 0.6689947620034218,
      "reward_std": 0.5862837731838226,
      "rewards/cosine_scaled_reward": -0.0434788279235363,
      "rewards/format_reward": 0.7559524029493332,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2161.34521484375,
      "epoch": 1.304,
      "grad_norm": 1.3934383392333984,
      "kl": 0.2412109375,
      "learning_rate": 3.931425787051832e-07,
      "loss": 0.0952,
      "reward": 0.7927189618349075,
      "reward_std": 0.8861154615879059,
      "rewards/cosine_scaled_reward": 0.03624042624142021,
      "rewards/format_reward": 0.7202381044626236,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2205.75,
      "epoch": 1.308,
      "grad_norm": 0.5909004211425781,
      "kl": 0.276611328125,
      "learning_rate": 3.902018669163384e-07,
      "loss": 0.0265,
      "reward": 0.7868844717741013,
      "reward_std": 0.6631656885147095,
      "rewards/cosine_scaled_reward": 0.024394613516051322,
      "rewards/format_reward": 0.7380952388048172,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2526.71435546875,
      "epoch": 1.312,
      "grad_norm": 0.37658610939979553,
      "kl": 0.30908203125,
      "learning_rate": 3.872689434630585e-07,
      "loss": 0.0593,
      "reward": 0.3922804482281208,
      "reward_std": 0.7164648473262787,
      "rewards/cosine_scaled_reward": -0.11933596897870302,
      "rewards/format_reward": 0.6309523731470108,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2406.6726684570312,
      "epoch": 1.316,
      "grad_norm": 0.5439748764038086,
      "kl": 0.28759765625,
      "learning_rate": 3.843439512918949e-07,
      "loss": 0.0395,
      "reward": 0.457830130122602,
      "reward_std": 0.6897861212491989,
      "rewards/cosine_scaled_reward": -0.10739446245133877,
      "rewards/format_reward": 0.6726190447807312,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2233.6548461914062,
      "epoch": 1.32,
      "grad_norm": 1.2243571281433105,
      "kl": 0.289306640625,
      "learning_rate": 3.8142703296283953e-07,
      "loss": 0.1087,
      "reward": 0.6516863703727722,
      "reward_std": 0.7036527991294861,
      "rewards/cosine_scaled_reward": -0.08189492486417294,
      "rewards/format_reward": 0.8154762089252472,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2212.7560119628906,
      "epoch": 1.324,
      "grad_norm": 0.8144615888595581,
      "kl": 0.28857421875,
      "learning_rate": 3.785183306423767e-07,
      "loss": 0.0775,
      "reward": 0.5815620422363281,
      "reward_std": 0.5177476480603218,
      "rewards/cosine_scaled_reward": -0.042552310740575194,
      "rewards/format_reward": 0.6666666865348816,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2412.0059814453125,
      "epoch": 1.328,
      "grad_norm": 0.42855292558670044,
      "kl": 0.3232421875,
      "learning_rate": 3.7561798609655373e-07,
      "loss": 0.0791,
      "reward": 0.642042949795723,
      "reward_std": 0.6289803832769394,
      "rewards/cosine_scaled_reward": -0.04207377042621374,
      "rewards/format_reward": 0.7261904925107956,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2163.9822387695312,
      "epoch": 1.332,
      "grad_norm": 1.0114275217056274,
      "kl": 0.255859375,
      "learning_rate": 3.72726140684072e-07,
      "loss": 0.088,
      "reward": 0.811268161451153,
      "reward_std": 0.6822613030672073,
      "rewards/cosine_scaled_reward": 0.042538831010460854,
      "rewards/format_reward": 0.7261904925107956,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2329.0178833007812,
      "epoch": 1.336,
      "grad_norm": 0.7170870900154114,
      "kl": 0.3486328125,
      "learning_rate": 3.6984293534939737e-07,
      "loss": 0.0455,
      "reward": 0.8848401606082916,
      "reward_std": 0.7328508943319321,
      "rewards/cosine_scaled_reward": 0.04361054569017142,
      "rewards/format_reward": 0.7976190745830536,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2709.2262573242188,
      "epoch": 1.34,
      "grad_norm": 0.47293010354042053,
      "kl": 0.38037109375,
      "learning_rate": 3.6696851061588994e-07,
      "loss": 0.0416,
      "reward": 0.3898888286203146,
      "reward_std": 0.6401937156915665,
      "rewards/cosine_scaled_reward": -0.10862701199948788,
      "rewards/format_reward": 0.6071428582072258,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2530.416748046875,
      "epoch": 1.3439999999999999,
      "grad_norm": 0.4423607885837555,
      "kl": 0.282958984375,
      "learning_rate": 3.641030065789562e-07,
      "loss": 0.0446,
      "reward": 0.6725399196147919,
      "reward_std": 0.7871751934289932,
      "rewards/cosine_scaled_reward": 0.0059128133580088615,
      "rewards/format_reward": 0.6607142984867096,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2311.5358276367188,
      "epoch": 1.3479999999999999,
      "grad_norm": 0.5007253885269165,
      "kl": 0.3203125,
      "learning_rate": 3.612465628992203e-07,
      "loss": 0.0455,
      "reward": 0.8073793947696686,
      "reward_std": 0.7870100140571594,
      "rewards/cosine_scaled_reward": 0.010832530329935253,
      "rewards/format_reward": 0.7857142984867096,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2489.7500610351562,
      "epoch": 1.3519999999999999,
      "grad_norm": 0.36444640159606934,
      "kl": 0.305908203125,
      "learning_rate": 3.5839931879571725e-07,
      "loss": 0.0652,
      "reward": 0.6751855611801147,
      "reward_std": 0.6701688021421432,
      "rewards/cosine_scaled_reward": 0.001283254474401474,
      "rewards/format_reward": 0.6726190596818924,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2460.8154907226562,
      "epoch": 1.3559999999999999,
      "grad_norm": 0.43892228603363037,
      "kl": 0.3369140625,
      "learning_rate": 3.555614130391079e-07,
      "loss": 0.0519,
      "reward": 0.6638183146715164,
      "reward_std": 0.770327016711235,
      "rewards/cosine_scaled_reward": 0.010480590397492051,
      "rewards/format_reward": 0.6428571566939354,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2244.3631896972656,
      "epoch": 1.3599999999999999,
      "grad_norm": 0.6102768778800964,
      "kl": 0.31201171875,
      "learning_rate": 3.5273298394491515e-07,
      "loss": 0.0694,
      "reward": 0.8422182202339172,
      "reward_std": 0.6671302318572998,
      "rewards/cosine_scaled_reward": 0.04610910080373287,
      "rewards/format_reward": 0.7500000074505806,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2239.75,
      "epoch": 1.3639999999999999,
      "grad_norm": 0.6582260727882385,
      "kl": 0.3271484375,
      "learning_rate": 3.4991416936678276e-07,
      "loss": 0.076,
      "reward": 0.6709855943918228,
      "reward_std": 0.7041856721043587,
      "rewards/cosine_scaled_reward": -0.03355482150800526,
      "rewards/format_reward": 0.7380952537059784,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2438.3214721679688,
      "epoch": 1.3679999999999999,
      "grad_norm": 0.5521511435508728,
      "kl": 0.320556640625,
      "learning_rate": 3.471051066897562e-07,
      "loss": 0.047,
      "reward": 0.6942454129457474,
      "reward_std": 0.6340186148881912,
      "rewards/cosine_scaled_reward": 0.010813180379045662,
      "rewards/format_reward": 0.6726190745830536,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2433.6488647460938,
      "epoch": 1.3719999999999999,
      "grad_norm": 0.928674042224884,
      "kl": 0.40478515625,
      "learning_rate": 3.4430593282358777e-07,
      "loss": 0.0328,
      "reward": 0.5231252759695053,
      "reward_std": 0.7485495656728745,
      "rewards/cosine_scaled_reward": -0.11641356535255909,
      "rewards/format_reward": 0.755952388048172,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2388.2202758789062,
      "epoch": 1.376,
      "grad_norm": 0.43529966473579407,
      "kl": 0.32080078125,
      "learning_rate": 3.4151678419606233e-07,
      "loss": 0.0303,
      "reward": 0.7449862584471703,
      "reward_std": 0.6971839666366577,
      "rewards/cosine_scaled_reward": 0.024278827477246523,
      "rewards/format_reward": 0.696428582072258,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2686.15478515625,
      "epoch": 1.38,
      "grad_norm": 0.5191164016723633,
      "kl": 0.36328125,
      "learning_rate": 3.387377967463493e-07,
      "loss": 0.0602,
      "reward": 0.4467791821807623,
      "reward_std": 0.6689166128635406,
      "rewards/cosine_scaled_reward": -0.07125327130779624,
      "rewards/format_reward": 0.5892857238650322,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2138.119110107422,
      "epoch": 1.384,
      "grad_norm": 0.40859875082969666,
      "kl": 0.344970703125,
      "learning_rate": 3.359691059183761e-07,
      "loss": 0.0894,
      "reward": 0.7263324186205864,
      "reward_std": 0.7082626074552536,
      "rewards/cosine_scaled_reward": -0.029690947383642197,
      "rewards/format_reward": 0.7857143133878708,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2158.6429138183594,
      "epoch": 1.388,
      "grad_norm": 0.35558465123176575,
      "kl": 0.29638671875,
      "learning_rate": 3.3321084665422803e-07,
      "loss": 0.0262,
      "reward": 0.6269577667117119,
      "reward_std": 0.5908889323472977,
      "rewards/cosine_scaled_reward": -0.04664018237963319,
      "rewards/format_reward": 0.7202381119132042,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2144.619110107422,
      "epoch": 1.392,
      "grad_norm": 1.211071491241455,
      "kl": 0.306640625,
      "learning_rate": 3.3046315338757026e-07,
      "loss": -0.0105,
      "reward": 0.6653935462236404,
      "reward_std": 0.6245283707976341,
      "rewards/cosine_scaled_reward": -0.04230323247611523,
      "rewards/format_reward": 0.75,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2366.4940795898438,
      "epoch": 1.396,
      "grad_norm": 0.5814414620399475,
      "kl": 0.33154296875,
      "learning_rate": 3.2772616003709616e-07,
      "loss": 0.0485,
      "reward": 0.5602632537484169,
      "reward_std": 0.5761818215250969,
      "rewards/cosine_scaled_reward": -0.0978445541113615,
      "rewards/format_reward": 0.755952388048172,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2348.6607666015625,
      "epoch": 1.4,
      "grad_norm": 0.675369918346405,
      "kl": 0.29931640625,
      "learning_rate": 3.250000000000001e-07,
      "loss": 0.0825,
      "reward": 0.475093599408865,
      "reward_std": 0.604865163564682,
      "rewards/cosine_scaled_reward": -0.07792939431965351,
      "rewards/format_reward": 0.6309523731470108,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2099.279815673828,
      "epoch": 1.404,
      "grad_norm": 0.5227596163749695,
      "kl": 0.33447265625,
      "learning_rate": 3.222848061454764e-07,
      "loss": 0.0454,
      "reward": 0.6502892896533012,
      "reward_std": 0.676431730389595,
      "rewards/cosine_scaled_reward": -0.05878393305465579,
      "rewards/format_reward": 0.7678571492433548,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2465.202392578125,
      "epoch": 1.408,
      "grad_norm": 0.4936739206314087,
      "kl": 0.33154296875,
      "learning_rate": 3.195807108082429e-07,
      "loss": 0.0349,
      "reward": 0.51472207903862,
      "reward_std": 0.6474315822124481,
      "rewards/cosine_scaled_reward": -0.05216278973966837,
      "rewards/format_reward": 0.6190476417541504,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2241.089324951172,
      "epoch": 1.412,
      "grad_norm": 0.4653976857662201,
      "kl": 0.3046875,
      "learning_rate": 3.168878457820915e-07,
      "loss": 0.0576,
      "reward": 0.7246856689453125,
      "reward_std": 0.7023278325796127,
      "rewards/cosine_scaled_reward": -0.02456192229874432,
      "rewards/format_reward": 0.7738095223903656,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2174.4107666015625,
      "epoch": 1.416,
      "grad_norm": 1.179158091545105,
      "kl": 0.31982421875,
      "learning_rate": 3.142063423134644e-07,
      "loss": 0.1321,
      "reward": 0.4120100736618042,
      "reward_std": 0.5803252756595612,
      "rewards/cosine_scaled_reward": -0.19280448742210865,
      "rewards/format_reward": 0.7976190596818924,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2560.9405517578125,
      "epoch": 1.42,
      "grad_norm": 0.6409890651702881,
      "kl": 0.3291015625,
      "learning_rate": 3.115363310950578e-07,
      "loss": 0.0637,
      "reward": 0.6557277590036392,
      "reward_std": 0.8805683702230453,
      "rewards/cosine_scaled_reward": -0.02332661801483482,
      "rewards/format_reward": 0.70238097012043,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1872.1012573242188,
      "epoch": 1.424,
      "grad_norm": 0.4570577144622803,
      "kl": 0.244873046875,
      "learning_rate": 3.0887794225945143e-07,
      "loss": 0.0537,
      "reward": 0.8301898017525673,
      "reward_std": 0.6987727582454681,
      "rewards/cosine_scaled_reward": -0.016452712705358863,
      "rewards/format_reward": 0.8630952537059784,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2199.4880981445312,
      "epoch": 1.428,
      "grad_norm": 0.5453688502311707,
      "kl": 0.3388671875,
      "learning_rate": 3.062313053727671e-07,
      "loss": 0.1004,
      "reward": 0.5429714322090149,
      "reward_std": 0.757801964879036,
      "rewards/cosine_scaled_reward": -0.11244285944849253,
      "rewards/format_reward": 0.7678571492433548,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2392.5536193847656,
      "epoch": 1.432,
      "grad_norm": 0.4179025888442993,
      "kl": 0.36767578125,
      "learning_rate": 3.0359654942835247e-07,
      "loss": 0.057,
      "reward": 0.6754717975854874,
      "reward_std": 0.8176562935113907,
      "rewards/cosine_scaled_reward": -0.004526023752987385,
      "rewards/format_reward": 0.6845238357782364,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2280.5000915527344,
      "epoch": 1.436,
      "grad_norm": 0.5272053480148315,
      "kl": 0.273681640625,
      "learning_rate": 3.0097380284049523e-07,
      "loss": 0.0565,
      "reward": 0.644446611404419,
      "reward_std": 0.7567472010850906,
      "rewards/cosine_scaled_reward": -0.02896718680858612,
      "rewards/format_reward": 0.70238097012043,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2270.3928833007812,
      "epoch": 1.44,
      "grad_norm": 0.8152810335159302,
      "kl": 0.34619140625,
      "learning_rate": 2.9836319343816397e-07,
      "loss": 0.0306,
      "reward": 0.7786325067281723,
      "reward_std": 0.559767447412014,
      "rewards/cosine_scaled_reward": -0.012469482608139515,
      "rewards/format_reward": 0.8035714477300644,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2094.2083740234375,
      "epoch": 1.444,
      "grad_norm": 0.9731494188308716,
      "kl": 0.33203125,
      "learning_rate": 2.9576484845877793e-07,
      "loss": 0.0315,
      "reward": 0.7239094823598862,
      "reward_std": 0.6780030280351639,
      "rewards/cosine_scaled_reward": -0.057688117027282715,
      "rewards/format_reward": 0.839285746216774,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2515.3809814453125,
      "epoch": 1.448,
      "grad_norm": 0.5006127953529358,
      "kl": 0.3583984375,
      "learning_rate": 2.931788945420058e-07,
      "loss": 0.0632,
      "reward": 0.5585716450586915,
      "reward_std": 0.6955743506550789,
      "rewards/cosine_scaled_reward": -0.08976180851459503,
      "rewards/format_reward": 0.7380952462553978,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2665.2560424804688,
      "epoch": 1.452,
      "grad_norm": 0.4868517220020294,
      "kl": 0.373046875,
      "learning_rate": 2.9060545772359305e-07,
      "loss": 0.0555,
      "reward": 0.5607914663851261,
      "reward_std": 0.6483574956655502,
      "rewards/cosine_scaled_reward": -0.07377092959359288,
      "rewards/format_reward": 0.7083333432674408,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2244.7262573242188,
      "epoch": 1.456,
      "grad_norm": 0.6844132542610168,
      "kl": 0.3173828125,
      "learning_rate": 2.8804466342921987e-07,
      "loss": 0.0109,
      "reward": 0.7073798812925816,
      "reward_std": 0.6621369272470474,
      "rewards/cosine_scaled_reward": -0.01833386719226837,
      "rewards/format_reward": 0.7440476417541504,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2576.1905517578125,
      "epoch": 1.46,
      "grad_norm": 0.5755227208137512,
      "kl": 0.35400390625,
      "learning_rate": 2.854966364683872e-07,
      "loss": 0.0531,
      "reward": 0.6706622801721096,
      "reward_std": 0.8000525310635567,
      "rewards/cosine_scaled_reward": -0.03371649980545044,
      "rewards/format_reward": 0.7380952537059784,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2664.3928833007812,
      "epoch": 1.464,
      "grad_norm": 0.6695978045463562,
      "kl": 0.4052734375,
      "learning_rate": 2.829615010283344e-07,
      "loss": 0.1001,
      "reward": 0.6332942470908165,
      "reward_std": 0.9363250732421875,
      "rewards/cosine_scaled_reward": -0.04049574676901102,
      "rewards/format_reward": 0.7142857313156128,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2493.2857666015625,
      "epoch": 1.468,
      "grad_norm": 0.41825661063194275,
      "kl": 0.269775390625,
      "learning_rate": 2.8043938066798645e-07,
      "loss": 0.0634,
      "reward": 0.6000736728310585,
      "reward_std": 0.6958686709403992,
      "rewards/cosine_scaled_reward": -0.04520127363502979,
      "rewards/format_reward": 0.690476194024086,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2441.184600830078,
      "epoch": 1.472,
      "grad_norm": 0.6742368936538696,
      "kl": 0.29248046875,
      "learning_rate": 2.7793039831193133e-07,
      "loss": 0.0205,
      "reward": 0.7077510952949524,
      "reward_std": 0.8173489719629288,
      "rewards/cosine_scaled_reward": -0.003267320804297924,
      "rewards/format_reward": 0.7142857313156128,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2645.202392578125,
      "epoch": 1.476,
      "grad_norm": 0.6914957761764526,
      "kl": 0.298095703125,
      "learning_rate": 2.7543467624442956e-07,
      "loss": 0.0967,
      "reward": 0.2303389220032841,
      "reward_std": 0.6355866640806198,
      "rewards/cosine_scaled_reward": -0.1616162583231926,
      "rewards/format_reward": 0.5535714328289032,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2256.9286193847656,
      "epoch": 1.48,
      "grad_norm": 0.9714637994766235,
      "kl": 0.255126953125,
      "learning_rate": 2.729523361034538e-07,
      "loss": 0.0866,
      "reward": 0.7436040937900543,
      "reward_std": 0.6377575844526291,
      "rewards/cosine_scaled_reward": -0.012126525864005089,
      "rewards/format_reward": 0.767857164144516,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2519.7202758789062,
      "epoch": 1.484,
      "grad_norm": 0.6541756987571716,
      "kl": 0.32470703125,
      "learning_rate": 2.7048349887476037e-07,
      "loss": 0.0731,
      "reward": 0.8480066582560539,
      "reward_std": 0.7711106240749359,
      "rewards/cosine_scaled_reward": 0.031146179419010878,
      "rewards/format_reward": 0.7857142984867096,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2420.970245361328,
      "epoch": 1.488,
      "grad_norm": 0.5346278548240662,
      "kl": 0.2998046875,
      "learning_rate": 2.6802828488599294e-07,
      "loss": 0.0556,
      "reward": 0.6287192776799202,
      "reward_std": 0.6931318640708923,
      "rewards/cosine_scaled_reward": -0.03683085576631129,
      "rewards/format_reward": 0.7023809552192688,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2542.5654907226562,
      "epoch": 1.492,
      "grad_norm": 0.43199771642684937,
      "kl": 0.33544921875,
      "learning_rate": 2.655868138008171e-07,
      "loss": 0.0657,
      "reward": 0.4730634540319443,
      "reward_std": 0.5836888402700424,
      "rewards/cosine_scaled_reward": -0.1533492412418127,
      "rewards/format_reward": 0.7797619104385376,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2848.3572387695312,
      "epoch": 1.496,
      "grad_norm": 0.6630088686943054,
      "kl": 0.35009765625,
      "learning_rate": 2.631592046130896e-07,
      "loss": 0.0207,
      "reward": 0.2956889607012272,
      "reward_std": 0.614417277276516,
      "rewards/cosine_scaled_reward": -0.1319174226373434,
      "rewards/format_reward": 0.5595238283276558,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2754.0060424804688,
      "epoch": 1.5,
      "grad_norm": 0.4504316449165344,
      "kl": 0.302490234375,
      "learning_rate": 2.6074557564105724e-07,
      "loss": 0.0225,
      "reward": 0.42709287256002426,
      "reward_std": 0.6112170070409775,
      "rewards/cosine_scaled_reward": -0.07514405064284801,
      "rewards/format_reward": 0.5773809552192688,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2403.52978515625,
      "epoch": 1.504,
      "grad_norm": 0.4335888624191284,
      "kl": 0.266845703125,
      "learning_rate": 2.583460445215911e-07,
      "loss": 0.0347,
      "reward": 0.37878482323139906,
      "reward_std": 0.5512942001223564,
      "rewards/cosine_scaled_reward": -0.14691711403429508,
      "rewards/format_reward": 0.6726190745830536,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2539.71435546875,
      "epoch": 1.508,
      "grad_norm": 0.6600142121315002,
      "kl": 0.35546875,
      "learning_rate": 2.5596072820445254e-07,
      "loss": 0.0879,
      "reward": 0.6289402991533279,
      "reward_std": 0.7740087658166885,
      "rewards/cosine_scaled_reward": -0.04564890172332525,
      "rewards/format_reward": 0.7202381044626236,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2566.4107666015625,
      "epoch": 1.512,
      "grad_norm": 0.5574218034744263,
      "kl": 0.310791015625,
      "learning_rate": 2.5358974294659373e-07,
      "loss": 0.0794,
      "reward": 0.5734596885740757,
      "reward_std": 0.6776000708341599,
      "rewards/cosine_scaled_reward": -0.058508249232545495,
      "rewards/format_reward": 0.6904762089252472,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2749.4642944335938,
      "epoch": 1.516,
      "grad_norm": 0.4314301908016205,
      "kl": 0.330078125,
      "learning_rate": 2.512332043064913e-07,
      "loss": 0.0655,
      "reward": 0.42858002707362175,
      "reward_std": 0.7303398549556732,
      "rewards/cosine_scaled_reward": -0.10416238568723202,
      "rewards/format_reward": 0.6369047611951828,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2403.684539794922,
      "epoch": 1.52,
      "grad_norm": 0.42673397064208984,
      "kl": 0.299560546875,
      "learning_rate": 2.488912271385139e-07,
      "loss": 0.0799,
      "reward": 0.7241241782903671,
      "reward_std": 0.7478837221860886,
      "rewards/cosine_scaled_reward": -0.006985542830079794,
      "rewards/format_reward": 0.7380952388048172,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2391.5179443359375,
      "epoch": 1.524,
      "grad_norm": 0.8130372762680054,
      "kl": 0.3203125,
      "learning_rate": 2.465639255873246e-07,
      "loss": 0.0286,
      "reward": 0.49668359011411667,
      "reward_std": 0.6931805461645126,
      "rewards/cosine_scaled_reward": -0.12665820121765137,
      "rewards/format_reward": 0.7500000149011612,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2430.5654907226562,
      "epoch": 1.528,
      "grad_norm": 0.4374740719795227,
      "kl": 0.310302734375,
      "learning_rate": 2.4425141308231765e-07,
      "loss": 0.0341,
      "reward": 0.6685771271586418,
      "reward_std": 0.8352404981851578,
      "rewards/cosine_scaled_reward": -0.016901913098990917,
      "rewards/format_reward": 0.7023809552192688,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2296.1607666015625,
      "epoch": 1.532,
      "grad_norm": 0.5494891405105591,
      "kl": 0.30224609375,
      "learning_rate": 2.4195380233209006e-07,
      "loss": 0.0859,
      "reward": 0.7063075229525566,
      "reward_std": 0.7431895136833191,
      "rewards/cosine_scaled_reward": -0.009941489901393652,
      "rewards/format_reward": 0.7261904925107956,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2517.827392578125,
      "epoch": 1.536,
      "grad_norm": 0.5410645604133606,
      "kl": 0.33203125,
      "learning_rate": 2.3967120531894857e-07,
      "loss": 0.0459,
      "reward": 0.42114658281207085,
      "reward_std": 0.6721706539392471,
      "rewards/cosine_scaled_reward": -0.12573623820208013,
      "rewards/format_reward": 0.6726190596818924,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2484.96435546875,
      "epoch": 1.54,
      "grad_norm": 0.4815540313720703,
      "kl": 0.3095703125,
      "learning_rate": 2.374037332934512e-07,
      "loss": 0.0759,
      "reward": 0.7441006675362587,
      "reward_std": 0.8601991981267929,
      "rewards/cosine_scaled_reward": 0.029788417392410338,
      "rewards/format_reward": 0.6845238208770752,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2389.9703063964844,
      "epoch": 1.544,
      "grad_norm": 0.6783538460731506,
      "kl": 0.2802734375,
      "learning_rate": 2.3515149676898552e-07,
      "loss": 0.0716,
      "reward": 0.479885321110487,
      "reward_std": 0.7240753322839737,
      "rewards/cosine_scaled_reward": -0.06958115100860596,
      "rewards/format_reward": 0.6190476417541504,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2359.964324951172,
      "epoch": 1.548,
      "grad_norm": 0.8481286764144897,
      "kl": 0.296630859375,
      "learning_rate": 2.3291460551638237e-07,
      "loss": 0.0148,
      "reward": 0.5802747337147593,
      "reward_std": 0.5601852983236313,
      "rewards/cosine_scaled_reward": -0.04617217415943742,
      "rewards/format_reward": 0.672619067132473,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2159.5655212402344,
      "epoch": 1.552,
      "grad_norm": 0.5140231251716614,
      "kl": 0.302001953125,
      "learning_rate": 2.306931685585657e-07,
      "loss": 0.063,
      "reward": 0.5727366209030151,
      "reward_std": 0.6229267343878746,
      "rewards/cosine_scaled_reward": -0.11244121752679348,
      "rewards/format_reward": 0.7976190745830536,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2253.7559814453125,
      "epoch": 1.556,
      "grad_norm": 0.4566425681114197,
      "kl": 0.292724609375,
      "learning_rate": 2.2848729416523859e-07,
      "loss": 0.0398,
      "reward": 0.6296885460615158,
      "reward_std": 0.7193648666143417,
      "rewards/cosine_scaled_reward": -0.04825095273554325,
      "rewards/format_reward": 0.7261904925107956,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2653.6250610351562,
      "epoch": 1.56,
      "grad_norm": 0.6326945424079895,
      "kl": 0.38818359375,
      "learning_rate": 2.2629708984760706e-07,
      "loss": 0.1043,
      "reward": 0.531873881816864,
      "reward_std": 0.7026529461145401,
      "rewards/cosine_scaled_reward": -0.0822773426771164,
      "rewards/format_reward": 0.696428582072258,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2613.7261962890625,
      "epoch": 1.564,
      "grad_norm": 0.398603618144989,
      "kl": 0.305908203125,
      "learning_rate": 2.2412266235313973e-07,
      "loss": 0.0464,
      "reward": 0.2553718090057373,
      "reward_std": 0.6311058104038239,
      "rewards/cosine_scaled_reward": -0.1699331346899271,
      "rewards/format_reward": 0.595238097012043,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2196.5833435058594,
      "epoch": 1.568,
      "grad_norm": 1.320838451385498,
      "kl": 0.2607421875,
      "learning_rate": 2.2196411766036487e-07,
      "loss": 0.1165,
      "reward": 0.6960010007023811,
      "reward_std": 0.8236257880926132,
      "rewards/cosine_scaled_reward": -0.044856662629172206,
      "rewards/format_reward": 0.7857143133878708,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2663.9285888671875,
      "epoch": 1.572,
      "grad_norm": 0.5183250904083252,
      "kl": 0.328857421875,
      "learning_rate": 2.1982156097370557e-07,
      "loss": 0.0708,
      "reward": 0.34957781434059143,
      "reward_std": 0.6104390919208527,
      "rewards/cosine_scaled_reward": -0.12878252286463976,
      "rewards/format_reward": 0.6071428805589676,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2630.2560424804688,
      "epoch": 1.576,
      "grad_norm": 0.2792785167694092,
      "kl": 0.34619140625,
      "learning_rate": 2.1769509671835223e-07,
      "loss": 0.0772,
      "reward": 0.45473287999629974,
      "reward_std": 0.7525355666875839,
      "rewards/cosine_scaled_reward": -0.0940621355548501,
      "rewards/format_reward": 0.6428571492433548,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2484.2679138183594,
      "epoch": 1.58,
      "grad_norm": 0.47966378927230835,
      "kl": 0.3330078125,
      "learning_rate": 2.1558482853517253e-07,
      "loss": 0.0783,
      "reward": 0.5024484526365995,
      "reward_std": 0.6865183711051941,
      "rewards/cosine_scaled_reward": -0.07615673809777945,
      "rewards/format_reward": 0.6547619104385376,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2481.96435546875,
      "epoch": 1.584,
      "grad_norm": 0.4925695061683655,
      "kl": 0.34228515625,
      "learning_rate": 2.134908592756607e-07,
      "loss": 0.1001,
      "reward": 0.6872217282652855,
      "reward_std": 0.7295544147491455,
      "rewards/cosine_scaled_reward": -0.037341527407988906,
      "rewards/format_reward": 0.761904776096344,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2154.494110107422,
      "epoch": 1.588,
      "grad_norm": 0.635874330997467,
      "kl": 0.281494140625,
      "learning_rate": 2.1141329099692406e-07,
      "loss": 0.0341,
      "reward": 0.6967096533626318,
      "reward_std": 0.7243114337325096,
      "rewards/cosine_scaled_reward": -0.04450232535600662,
      "rewards/format_reward": 0.7857143059372902,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2704.7322387695312,
      "epoch": 1.592,
      "grad_norm": 0.36841636896133423,
      "kl": 0.390625,
      "learning_rate": 2.0935222495670968e-07,
      "loss": 0.0683,
      "reward": 0.35482142120599747,
      "reward_std": 0.6981697529554367,
      "rewards/cosine_scaled_reward": -0.1410416765138507,
      "rewards/format_reward": 0.6369047611951828,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2155.1905212402344,
      "epoch": 1.596,
      "grad_norm": 0.6153831481933594,
      "kl": 0.2763671875,
      "learning_rate": 2.0730776160846853e-07,
      "loss": 0.075,
      "reward": 0.7129835858941078,
      "reward_std": 0.7049887701869011,
      "rewards/cosine_scaled_reward": -0.04827013239264488,
      "rewards/format_reward": 0.8095238208770752,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2736.9286499023438,
      "epoch": 1.6,
      "grad_norm": 0.4656315743923187,
      "kl": 0.38330078125,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.0486,
      "reward": 0.3270074762403965,
      "reward_std": 0.6684512719511986,
      "rewards/cosine_scaled_reward": -0.17578197922557592,
      "rewards/format_reward": 0.6785714477300644,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2684.994140625,
      "epoch": 1.604,
      "grad_norm": 0.4505021274089813,
      "kl": 0.34423828125,
      "learning_rate": 2.032690407508949e-07,
      "loss": 0.0772,
      "reward": 0.5096228048205376,
      "reward_std": 0.7098966240882874,
      "rewards/cosine_scaled_reward": -0.0814981039147824,
      "rewards/format_reward": 0.6726190522313118,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2508.327423095703,
      "epoch": 1.608,
      "grad_norm": 0.3696132302284241,
      "kl": 0.34033203125,
      "learning_rate": 2.0127498008311922e-07,
      "loss": 0.0554,
      "reward": 0.6811544820666313,
      "reward_std": 0.7352585643529892,
      "rewards/cosine_scaled_reward": -0.03442276082932949,
      "rewards/format_reward": 0.7500000149011612,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2329.654815673828,
      "epoch": 1.612,
      "grad_norm": 0.5500597953796387,
      "kl": 0.310302734375,
      "learning_rate": 1.9929791578083655e-07,
      "loss": 0.0912,
      "reward": 0.5886539276689291,
      "reward_std": 0.6953590214252472,
      "rewards/cosine_scaled_reward": -0.05091113201342523,
      "rewards/format_reward": 0.6904762089252472,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2279.5238342285156,
      "epoch": 1.616,
      "grad_norm": 0.825627326965332,
      "kl": 0.314208984375,
      "learning_rate": 1.9733794420337213e-07,
      "loss": 0.0303,
      "reward": 0.4903724156320095,
      "reward_std": 0.6759866625070572,
      "rewards/cosine_scaled_reward": -0.1268376000225544,
      "rewards/format_reward": 0.7440476417541504,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2298.607208251953,
      "epoch": 1.62,
      "grad_norm": 0.7521853446960449,
      "kl": 0.300048828125,
      "learning_rate": 1.9539516087697517e-07,
      "loss": 0.0742,
      "reward": 0.6187992710620165,
      "reward_std": 0.6595779061317444,
      "rewards/cosine_scaled_reward": -0.050719428109005094,
      "rewards/format_reward": 0.7202381044626236,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2462.0416870117188,
      "epoch": 1.624,
      "grad_norm": 0.4565219581127167,
      "kl": 0.36181640625,
      "learning_rate": 1.934696604901642e-07,
      "loss": 0.0655,
      "reward": 0.7676936537027359,
      "reward_std": 0.8463387489318848,
      "rewards/cosine_scaled_reward": 0.032656354829669,
      "rewards/format_reward": 0.7023809552192688,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2593.9285888671875,
      "epoch": 1.6280000000000001,
      "grad_norm": 0.7805240154266357,
      "kl": 0.37109375,
      "learning_rate": 1.915615368891117e-07,
      "loss": 0.0336,
      "reward": 0.4898635447025299,
      "reward_std": 0.6100385710597038,
      "rewards/cosine_scaled_reward": -0.0943539384752512,
      "rewards/format_reward": 0.6785714477300644,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2179.8810119628906,
      "epoch": 1.6320000000000001,
      "grad_norm": 0.7494162321090698,
      "kl": 0.3154296875,
      "learning_rate": 1.8967088307307e-07,
      "loss": 0.0819,
      "reward": 0.5508405864238739,
      "reward_std": 0.6755202859640121,
      "rewards/cosine_scaled_reward": -0.12934163073077798,
      "rewards/format_reward": 0.8095238357782364,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2213.1131591796875,
      "epoch": 1.6360000000000001,
      "grad_norm": 0.7274454832077026,
      "kl": 0.3466796875,
      "learning_rate": 1.8779779118983867e-07,
      "loss": 0.0553,
      "reward": 0.6235681027173996,
      "reward_std": 0.6233258098363876,
      "rewards/cosine_scaled_reward": -0.10488261096179485,
      "rewards/format_reward": 0.833333358168602,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2276.5774536132812,
      "epoch": 1.6400000000000001,
      "grad_norm": 1.2181180715560913,
      "kl": 0.357421875,
      "learning_rate": 1.8594235253127372e-07,
      "loss": 0.1457,
      "reward": 0.6739452332258224,
      "reward_std": 0.7620265781879425,
      "rewards/cosine_scaled_reward": -0.0350511996075511,
      "rewards/format_reward": 0.7440476268529892,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2513.0833740234375,
      "epoch": 1.6440000000000001,
      "grad_norm": 0.3816966414451599,
      "kl": 0.37060546875,
      "learning_rate": 1.8410465752883758e-07,
      "loss": 0.0631,
      "reward": 0.503595694899559,
      "reward_std": 0.6215758174657822,
      "rewards/cosine_scaled_reward": -0.08748787135118619,
      "rewards/format_reward": 0.6785714328289032,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2348.7916870117188,
      "epoch": 1.6480000000000001,
      "grad_norm": 0.647936224937439,
      "kl": 0.3623046875,
      "learning_rate": 1.822847957491922e-07,
      "loss": 0.1147,
      "reward": 0.7675136551260948,
      "reward_std": 0.7814928591251373,
      "rewards/cosine_scaled_reward": 0.020661589689552784,
      "rewards/format_reward": 0.7261904925107956,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2329.9464721679688,
      "epoch": 1.6520000000000001,
      "grad_norm": 0.7966573238372803,
      "kl": 0.3505859375,
      "learning_rate": 1.804828558898332e-07,
      "loss": 0.0507,
      "reward": 0.7220299392938614,
      "reward_std": 0.7220810800790787,
      "rewards/cosine_scaled_reward": -0.01993740734178573,
      "rewards/format_reward": 0.761904776096344,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2510.83935546875,
      "epoch": 1.6560000000000001,
      "grad_norm": 0.3402910828590393,
      "kl": 0.36767578125,
      "learning_rate": 1.7869892577476722e-07,
      "loss": 0.0717,
      "reward": 0.6566540375351906,
      "reward_std": 0.7324352562427521,
      "rewards/cosine_scaled_reward": -0.016911087092012167,
      "rewards/format_reward": 0.6904762089252472,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2362.952423095703,
      "epoch": 1.6600000000000001,
      "grad_norm": 0.5068221688270569,
      "kl": 0.41357421875,
      "learning_rate": 1.7693309235023127e-07,
      "loss": 0.0817,
      "reward": 0.5704538598656654,
      "reward_std": 0.83931764960289,
      "rewards/cosine_scaled_reward": -0.08977308124303818,
      "rewards/format_reward": 0.7500000298023224,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2197.886962890625,
      "epoch": 1.6640000000000001,
      "grad_norm": 0.5192682147026062,
      "kl": 0.349609375,
      "learning_rate": 1.7518544168045524e-07,
      "loss": 0.0456,
      "reward": 0.7760029062628746,
      "reward_std": 0.7372387051582336,
      "rewards/cosine_scaled_reward": -0.0048556849360466,
      "rewards/format_reward": 0.7857142984867096,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2365.0655517578125,
      "epoch": 1.6680000000000001,
      "grad_norm": 0.7471702098846436,
      "kl": 0.357421875,
      "learning_rate": 1.7345605894346726e-07,
      "loss": 0.0258,
      "reward": 0.6658978462219238,
      "reward_std": 0.7144315093755722,
      "rewards/cosine_scaled_reward": -0.02419395267497748,
      "rewards/format_reward": 0.7142857313156128,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2210.7500610351562,
      "epoch": 1.6720000000000002,
      "grad_norm": 0.4305538833141327,
      "kl": 0.37060546875,
      "learning_rate": 1.7174502842694212e-07,
      "loss": 0.0715,
      "reward": 0.6991885676980019,
      "reward_std": 0.6301053613424301,
      "rewards/cosine_scaled_reward": -0.03433429542928934,
      "rewards/format_reward": 0.7678571492433548,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2631.9048461914062,
      "epoch": 1.6760000000000002,
      "grad_norm": 0.31225350499153137,
      "kl": 0.35888671875,
      "learning_rate": 1.7005243352409333e-07,
      "loss": 0.0697,
      "reward": 0.6943976636976004,
      "reward_std": 0.7306639850139618,
      "rewards/cosine_scaled_reward": 0.02874644659459591,
      "rewards/format_reward": 0.636904776096344,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2425.6607055664062,
      "epoch": 1.6800000000000002,
      "grad_norm": 0.6987324953079224,
      "kl": 0.38330078125,
      "learning_rate": 1.6837835672960831e-07,
      "loss": 0.0585,
      "reward": 0.7563354596495628,
      "reward_std": 0.7114580571651459,
      "rewards/cosine_scaled_reward": -0.038498950423672795,
      "rewards/format_reward": 0.8333333432674408,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2277.4464721679688,
      "epoch": 1.6840000000000002,
      "grad_norm": 0.34894487261772156,
      "kl": 0.373046875,
      "learning_rate": 1.6672287963562852e-07,
      "loss": 0.0935,
      "reward": 0.6365625336766243,
      "reward_std": 0.7153737097978592,
      "rewards/cosine_scaled_reward": -0.05969492206349969,
      "rewards/format_reward": 0.7559524029493332,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2504.916748046875,
      "epoch": 1.688,
      "grad_norm": 0.6229146718978882,
      "kl": 0.35498046875,
      "learning_rate": 1.6508608292777203e-07,
      "loss": 0.0181,
      "reward": 0.5784893482923508,
      "reward_std": 0.6405449658632278,
      "rewards/cosine_scaled_reward": -0.06194583047181368,
      "rewards/format_reward": 0.7023809552192688,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2308.52978515625,
      "epoch": 1.692,
      "grad_norm": 0.4013311266899109,
      "kl": 0.328125,
      "learning_rate": 1.6346804638120098e-07,
      "loss": 0.0574,
      "reward": 0.6359338611364365,
      "reward_std": 0.7808969020843506,
      "rewards/cosine_scaled_reward": -0.03917593788355589,
      "rewards/format_reward": 0.714285746216774,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2253.6785888671875,
      "epoch": 1.696,
      "grad_norm": 0.7038490176200867,
      "kl": 0.288330078125,
      "learning_rate": 1.6186884885673413e-07,
      "loss": 0.0231,
      "reward": 0.6301399618387222,
      "reward_std": 0.7140125781297684,
      "rewards/cosine_scaled_reward": -0.07481098547577858,
      "rewards/format_reward": 0.7797619104385376,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2256.464385986328,
      "epoch": 1.7,
      "grad_norm": 0.4849310517311096,
      "kl": 0.34228515625,
      "learning_rate": 1.6028856829700258e-07,
      "loss": 0.0754,
      "reward": 0.6902762800455093,
      "reward_std": 0.7732263505458832,
      "rewards/cosine_scaled_reward": -0.029861881979741156,
      "rewards/format_reward": 0.7500000149011612,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2638.3750610351562,
      "epoch": 1.704,
      "grad_norm": 0.4661174416542053,
      "kl": 0.34716796875,
      "learning_rate": 1.5872728172265146e-07,
      "loss": 0.0631,
      "reward": 0.4628839958459139,
      "reward_std": 0.6522120535373688,
      "rewards/cosine_scaled_reward": -0.06915326602756977,
      "rewards/format_reward": 0.601190485060215,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2258.279815673828,
      "epoch": 1.708,
      "grad_norm": 0.5582512021064758,
      "kl": 0.325439453125,
      "learning_rate": 1.5718506522858572e-07,
      "loss": 0.0416,
      "reward": 0.6841344758868217,
      "reward_std": 0.716413825750351,
      "rewards/cosine_scaled_reward": -0.04483753815293312,
      "rewards/format_reward": 0.7738095298409462,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2152.732208251953,
      "epoch": 1.712,
      "grad_norm": 0.362613320350647,
      "kl": 0.35546875,
      "learning_rate": 1.5566199398026147e-07,
      "loss": 0.0663,
      "reward": 0.8665853589773178,
      "reward_std": 0.746289573609829,
      "rewards/cosine_scaled_reward": 0.013649825006723404,
      "rewards/format_reward": 0.8392857164144516,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2589.8690795898438,
      "epoch": 1.716,
      "grad_norm": 0.392411470413208,
      "kl": 0.35693359375,
      "learning_rate": 1.5415814221002265e-07,
      "loss": 0.0781,
      "reward": 0.6071142517030239,
      "reward_std": 0.7519797533750534,
      "rewards/cosine_scaled_reward": -0.044657152146101,
      "rewards/format_reward": 0.696428582072258,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2220.1012573242188,
      "epoch": 1.72,
      "grad_norm": 0.5445396900177002,
      "kl": 0.341796875,
      "learning_rate": 1.5267358321348285e-07,
      "loss": 0.0534,
      "reward": 0.701055221259594,
      "reward_std": 0.6740739792585373,
      "rewards/cosine_scaled_reward": -0.04232952371239662,
      "rewards/format_reward": 0.7857142984867096,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1902.5595397949219,
      "epoch": 1.724,
      "grad_norm": 0.3450181186199188,
      "kl": 0.2607421875,
      "learning_rate": 1.5120838934595337e-07,
      "loss": 0.0472,
      "reward": 1.0963951796293259,
      "reward_std": 0.746511772274971,
      "rewards/cosine_scaled_reward": 0.11962614580988884,
      "rewards/format_reward": 0.8571428805589676,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2478.732177734375,
      "epoch": 1.728,
      "grad_norm": 0.48547589778900146,
      "kl": 0.373046875,
      "learning_rate": 1.4976263201891613e-07,
      "loss": 0.0409,
      "reward": 0.5677317231893539,
      "reward_std": 0.6051659360527992,
      "rewards/cosine_scaled_reward": -0.09113414993043989,
      "rewards/format_reward": 0.7500000149011612,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2518.7560424804688,
      "epoch": 1.732,
      "grad_norm": 0.9139849543571472,
      "kl": 0.3701171875,
      "learning_rate": 1.483363816965435e-07,
      "loss": 0.01,
      "reward": 0.6564267948269844,
      "reward_std": 0.6321954727172852,
      "rewards/cosine_scaled_reward": -0.04083424177952111,
      "rewards/format_reward": 0.7380952388048172,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2439.119110107422,
      "epoch": 1.736,
      "grad_norm": 0.4629580080509186,
      "kl": 0.3349609375,
      "learning_rate": 1.469297078922642e-07,
      "loss": 0.0828,
      "reward": 0.6547855883836746,
      "reward_std": 0.6274382770061493,
      "rewards/cosine_scaled_reward": -0.05653578881174326,
      "rewards/format_reward": 0.767857164144516,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2385.047637939453,
      "epoch": 1.74,
      "grad_norm": 0.4398196041584015,
      "kl": 0.342529296875,
      "learning_rate": 1.4554267916537495e-07,
      "loss": 0.034,
      "reward": 0.59782674908638,
      "reward_std": 0.7165493220090866,
      "rewards/cosine_scaled_reward": -0.09394377004355192,
      "rewards/format_reward": 0.7857142984867096,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2235.4405212402344,
      "epoch": 1.744,
      "grad_norm": 0.5894522070884705,
      "kl": 0.321044921875,
      "learning_rate": 1.4417536311769885e-07,
      "loss": 0.0578,
      "reward": 0.7302387952804565,
      "reward_std": 0.7528126537799835,
      "rewards/cosine_scaled_reward": -0.018809196539223194,
      "rewards/format_reward": 0.7678571492433548,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2364.2083740234375,
      "epoch": 1.748,
      "grad_norm": 0.517291784286499,
      "kl": 0.262451171875,
      "learning_rate": 1.4282782639029128e-07,
      "loss": 0.0626,
      "reward": 0.5654323399066925,
      "reward_std": 0.6579017788171768,
      "rewards/cosine_scaled_reward": -0.03276003524661064,
      "rewards/format_reward": 0.630952388048172,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2464.0238647460938,
      "epoch": 1.752,
      "grad_norm": 0.719810426235199,
      "kl": 0.34619140625,
      "learning_rate": 1.4150013466019114e-07,
      "loss": 0.0289,
      "reward": 0.5253645405173302,
      "reward_std": 0.593732014298439,
      "rewards/cosine_scaled_reward": -0.10934155760332942,
      "rewards/format_reward": 0.7440476268529892,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2688.2559814453125,
      "epoch": 1.756,
      "grad_norm": 0.47081246972084045,
      "kl": 0.320068359375,
      "learning_rate": 1.4019235263722034e-07,
      "loss": 0.0737,
      "reward": 0.5551765933632851,
      "reward_std": 0.750535324215889,
      "rewards/cosine_scaled_reward": -0.043840276543051004,
      "rewards/format_reward": 0.6428571492433548,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2691.5298461914062,
      "epoch": 1.76,
      "grad_norm": 0.3561669588088989,
      "kl": 0.30517578125,
      "learning_rate": 1.3890454406082956e-07,
      "loss": 0.0412,
      "reward": 0.6305891573429108,
      "reward_std": 0.7718498408794403,
      "rewards/cosine_scaled_reward": -0.02399112842977047,
      "rewards/format_reward": 0.6785714477300644,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2423.3274536132812,
      "epoch": 1.764,
      "grad_norm": 0.8448560237884521,
      "kl": 0.34130859375,
      "learning_rate": 1.3763677169699217e-07,
      "loss": 0.0974,
      "reward": 0.6258707121014595,
      "reward_std": 0.7022215574979782,
      "rewards/cosine_scaled_reward": -0.06504084914922714,
      "rewards/format_reward": 0.7559524029493332,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2509.6845703125,
      "epoch": 1.768,
      "grad_norm": 0.49845507740974426,
      "kl": 0.3271484375,
      "learning_rate": 1.3638909733514452e-07,
      "loss": 0.0368,
      "reward": 0.6952026858925819,
      "reward_std": 0.7732700109481812,
      "rewards/cosine_scaled_reward": -0.03037486458197236,
      "rewards/format_reward": 0.7559524029493332,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2251.6370239257812,
      "epoch": 1.772,
      "grad_norm": 1.6570687294006348,
      "kl": 0.32861328125,
      "learning_rate": 1.351615817851748e-07,
      "loss": 0.1251,
      "reward": 0.5299716778099537,
      "reward_std": 0.6262076199054718,
      "rewards/cosine_scaled_reward": -0.10108558752108365,
      "rewards/format_reward": 0.7321428507566452,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2569.8392944335938,
      "epoch": 1.776,
      "grad_norm": 0.5071792602539062,
      "kl": 0.285400390625,
      "learning_rate": 1.3395428487445914e-07,
      "loss": 0.0461,
      "reward": 0.3558937795460224,
      "reward_std": 0.6386721879243851,
      "rewards/cosine_scaled_reward": -0.12860072287730873,
      "rewards/format_reward": 0.6130952537059784,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2434.52978515625,
      "epoch": 1.78,
      "grad_norm": 0.6472364068031311,
      "kl": 0.33935546875,
      "learning_rate": 1.3276726544494571e-07,
      "loss": 0.0324,
      "reward": 0.5342502817511559,
      "reward_std": 0.7027776390314102,
      "rewards/cosine_scaled_reward": -0.08108916692435741,
      "rewards/format_reward": 0.696428582072258,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2600.7678833007812,
      "epoch": 1.784,
      "grad_norm": 0.4613596796989441,
      "kl": 0.36767578125,
      "learning_rate": 1.316005813502869e-07,
      "loss": 0.0366,
      "reward": 0.3209609054028988,
      "reward_std": 0.6079899072647095,
      "rewards/cosine_scaled_reward": -0.15499573945999146,
      "rewards/format_reward": 0.630952388048172,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2406.6131286621094,
      "epoch": 1.788,
      "grad_norm": 0.5609318017959595,
      "kl": 0.3349609375,
      "learning_rate": 1.3045428945301953e-07,
      "loss": 0.1069,
      "reward": 0.8279012702405453,
      "reward_std": 0.6648521721363068,
      "rewards/cosine_scaled_reward": 0.035974426195025444,
      "rewards/format_reward": 0.755952388048172,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2292.5238647460938,
      "epoch": 1.792,
      "grad_norm": 0.6498924493789673,
      "kl": 0.3251953125,
      "learning_rate": 1.2932844562179352e-07,
      "loss": 0.1002,
      "reward": 0.9468577206134796,
      "reward_std": 0.8284895867109299,
      "rewards/cosine_scaled_reward": 0.1073574130423367,
      "rewards/format_reward": 0.7321428656578064,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2359.7203063964844,
      "epoch": 1.796,
      "grad_norm": 0.8151586651802063,
      "kl": 0.29248046875,
      "learning_rate": 1.2822310472864885e-07,
      "loss": 0.1025,
      "reward": 0.6154885776340961,
      "reward_std": 0.643234595656395,
      "rewards/cosine_scaled_reward": -0.028565243119373918,
      "rewards/format_reward": 0.6726190596818924,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2456.5952758789062,
      "epoch": 1.8,
      "grad_norm": 0.6727307438850403,
      "kl": 0.3369140625,
      "learning_rate": 1.2713832064634125e-07,
      "loss": 0.0518,
      "reward": 0.5276128388941288,
      "reward_std": 0.6850098147988319,
      "rewards/cosine_scaled_reward": -0.05166977294720709,
      "rewards/format_reward": 0.6309523805975914,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2180.250030517578,
      "epoch": 1.804,
      "grad_norm": 0.4327715039253235,
      "kl": 0.2841796875,
      "learning_rate": 1.260741462457165e-07,
      "loss": 0.0396,
      "reward": 0.9219238460063934,
      "reward_std": 0.8085188716650009,
      "rewards/cosine_scaled_reward": 0.029414291959255934,
      "rewards/format_reward": 0.8630952537059784,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2414.9584045410156,
      "epoch": 1.808,
      "grad_norm": 0.5890040993690491,
      "kl": 0.33642578125,
      "learning_rate": 1.2503063339313356e-07,
      "loss": 0.0116,
      "reward": 0.45377534069120884,
      "reward_std": 0.6864534169435501,
      "rewards/cosine_scaled_reward": -0.07668375968933105,
      "rewards/format_reward": 0.607142873108387,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2325.2619018554688,
      "epoch": 1.812,
      "grad_norm": 0.8580995798110962,
      "kl": 0.38330078125,
      "learning_rate": 1.2400783294793668e-07,
      "loss": 0.1211,
      "reward": 0.41875267028808594,
      "reward_std": 0.5978472009301186,
      "rewards/cosine_scaled_reward": -0.17455224692821503,
      "rewards/format_reward": 0.7678571492433548,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2524.8452758789062,
      "epoch": 1.8159999999999998,
      "grad_norm": 0.6263750195503235,
      "kl": 0.369140625,
      "learning_rate": 1.2300579475997657e-07,
      "loss": 0.0599,
      "reward": 0.5164637118577957,
      "reward_std": 0.7428598999977112,
      "rewards/cosine_scaled_reward": -0.0989109962247312,
      "rewards/format_reward": 0.7142857313156128,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2222.684600830078,
      "epoch": 1.8199999999999998,
      "grad_norm": 0.46227335929870605,
      "kl": 0.257568359375,
      "learning_rate": 1.220245676671809e-07,
      "loss": 0.0452,
      "reward": 0.6773854792118073,
      "reward_std": 0.6582589149475098,
      "rewards/cosine_scaled_reward": -0.03333106730133295,
      "rewards/format_reward": 0.744047611951828,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2381.2916870117188,
      "epoch": 1.8239999999999998,
      "grad_norm": 1.395836591720581,
      "kl": 0.346923828125,
      "learning_rate": 1.2106419949317388e-07,
      "loss": -0.0036,
      "reward": 0.6790124624967575,
      "reward_std": 0.7677509784698486,
      "rewards/cosine_scaled_reward": -0.0354937631636858,
      "rewards/format_reward": 0.7500000298023224,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2236.1369018554688,
      "epoch": 1.8279999999999998,
      "grad_norm": 0.363459974527359,
      "kl": 0.359375,
      "learning_rate": 1.2012473704494537e-07,
      "loss": 0.0897,
      "reward": 0.8419362753629684,
      "reward_std": 0.8306869268417358,
      "rewards/cosine_scaled_reward": 0.034063366474583745,
      "rewards/format_reward": 0.7738095223903656,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2312.5059814453125,
      "epoch": 1.8319999999999999,
      "grad_norm": 0.5052822232246399,
      "kl": 0.35791015625,
      "learning_rate": 1.1920622611056974e-07,
      "loss": 0.0619,
      "reward": 0.744497187435627,
      "reward_std": 0.6325250118970871,
      "rewards/cosine_scaled_reward": -0.02953713061287999,
      "rewards/format_reward": 0.8035714477300644,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2242.0357971191406,
      "epoch": 1.8359999999999999,
      "grad_norm": 0.4124799966812134,
      "kl": 0.30615234375,
      "learning_rate": 1.1830871145697412e-07,
      "loss": 0.0801,
      "reward": 0.7117128595709801,
      "reward_std": 0.7263730615377426,
      "rewards/cosine_scaled_reward": -0.031048328906763345,
      "rewards/format_reward": 0.773809552192688,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2795.3512573242188,
      "epoch": 1.8399999999999999,
      "grad_norm": 0.5879099369049072,
      "kl": 0.341796875,
      "learning_rate": 1.1743223682775649e-07,
      "loss": 0.0299,
      "reward": 0.41843298077583313,
      "reward_std": 0.6329772919416428,
      "rewards/cosine_scaled_reward": -0.11518826894462109,
      "rewards/format_reward": 0.6488095223903656,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2640.6904907226562,
      "epoch": 1.8439999999999999,
      "grad_norm": 0.3294979929924011,
      "kl": 0.38818359375,
      "learning_rate": 1.1657684494105386e-07,
      "loss": 0.0662,
      "reward": 0.6549192667007446,
      "reward_std": 0.8022814393043518,
      "rewards/cosine_scaled_reward": -0.02373085916042328,
      "rewards/format_reward": 0.7023809552192688,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2345.4285888671875,
      "epoch": 1.8479999999999999,
      "grad_norm": 0.3771924376487732,
      "kl": 0.35595703125,
      "learning_rate": 1.1574257748745986e-07,
      "loss": 0.105,
      "reward": 0.6399585753679276,
      "reward_std": 0.7968022599816322,
      "rewards/cosine_scaled_reward": -0.08478261809796095,
      "rewards/format_reward": 0.8095238208770752,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2589.1666870117188,
      "epoch": 1.8519999999999999,
      "grad_norm": 0.3122951090335846,
      "kl": 0.3798828125,
      "learning_rate": 1.1492947512799328e-07,
      "loss": 0.0565,
      "reward": 0.43525535613298416,
      "reward_std": 0.7020779103040695,
      "rewards/cosine_scaled_reward": -0.09784852154552937,
      "rewards/format_reward": 0.6309523954987526,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2481.261962890625,
      "epoch": 1.8559999999999999,
      "grad_norm": 0.5730969905853271,
      "kl": 0.33984375,
      "learning_rate": 1.1413757749211602e-07,
      "loss": 0.0699,
      "reward": 0.7782276198267937,
      "reward_std": 0.6072199195623398,
      "rewards/cosine_scaled_reward": 0.02304239757359028,
      "rewards/format_reward": 0.7321428656578064,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2644.619140625,
      "epoch": 1.8599999999999999,
      "grad_norm": 0.39649447798728943,
      "kl": 0.373046875,
      "learning_rate": 1.1336692317580158e-07,
      "loss": 0.0712,
      "reward": 0.6695687249302864,
      "reward_std": 0.8249562680721283,
      "rewards/cosine_scaled_reward": 0.007403409108519554,
      "rewards/format_reward": 0.6547619178891182,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2491.0119018554688,
      "epoch": 1.8639999999999999,
      "grad_norm": 0.4567016065120697,
      "kl": 0.36376953125,
      "learning_rate": 1.1261754973965422e-07,
      "loss": 0.0702,
      "reward": 0.46852924674749374,
      "reward_std": 0.7603975385427475,
      "rewards/cosine_scaled_reward": -0.1199020454660058,
      "rewards/format_reward": 0.708333358168602,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2532.6607666015625,
      "epoch": 1.8679999999999999,
      "grad_norm": 0.36717355251312256,
      "kl": 0.330078125,
      "learning_rate": 1.1188949370707787e-07,
      "loss": 0.0491,
      "reward": 0.5859625339508057,
      "reward_std": 0.6559573635458946,
      "rewards/cosine_scaled_reward": -0.028447304794099182,
      "rewards/format_reward": 0.6428571492433548,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2622.2559814453125,
      "epoch": 1.8719999999999999,
      "grad_norm": 0.4103451669216156,
      "kl": 0.36279296875,
      "learning_rate": 1.1118279056249653e-07,
      "loss": 0.0606,
      "reward": 0.6576881408691406,
      "reward_std": 0.8001267910003662,
      "rewards/cosine_scaled_reward": -0.007465461269021034,
      "rewards/format_reward": 0.6726190596818924,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2134.279815673828,
      "epoch": 1.876,
      "grad_norm": 0.5138155817985535,
      "kl": 0.256103515625,
      "learning_rate": 1.1049747474962444e-07,
      "loss": 0.0759,
      "reward": 0.6648233011364937,
      "reward_std": 0.7618712484836578,
      "rewards/cosine_scaled_reward": -0.06044549681246281,
      "rewards/format_reward": 0.7857143133878708,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2160.202392578125,
      "epoch": 1.88,
      "grad_norm": 0.3472672700881958,
      "kl": 0.30126953125,
      "learning_rate": 1.0983357966978745e-07,
      "loss": 0.0461,
      "reward": 0.7422109395265579,
      "reward_std": 0.6609758958220482,
      "rewards/cosine_scaled_reward": -0.012823125813156366,
      "rewards/format_reward": 0.767857164144516,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2532.3809814453125,
      "epoch": 1.884,
      "grad_norm": 0.2868484556674957,
      "kl": 0.2763671875,
      "learning_rate": 1.0919113768029517e-07,
      "loss": 0.0493,
      "reward": 0.48562416061758995,
      "reward_std": 0.7373960316181183,
      "rewards/cosine_scaled_reward": -0.08159269354655407,
      "rewards/format_reward": 0.648809552192688,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2409.3392944335938,
      "epoch": 1.888,
      "grad_norm": 0.4656960964202881,
      "kl": 0.390380859375,
      "learning_rate": 1.0857018009286381e-07,
      "loss": 0.0561,
      "reward": 0.6008469248190522,
      "reward_std": 0.6282935440540314,
      "rewards/cosine_scaled_reward": -0.05374322272837162,
      "rewards/format_reward": 0.708333358168602,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2294.0178833007812,
      "epoch": 1.892,
      "grad_norm": 0.5274000763893127,
      "kl": 0.3369140625,
      "learning_rate": 1.0797073717209013e-07,
      "loss": 0.0877,
      "reward": 0.7744100838899612,
      "reward_std": 0.7935537397861481,
      "rewards/cosine_scaled_reward": 0.00030028633773326874,
      "rewards/format_reward": 0.7738095372915268,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2676.6727294921875,
      "epoch": 1.896,
      "grad_norm": 0.5600417256355286,
      "kl": 0.302978515625,
      "learning_rate": 1.0739283813397639e-07,
      "loss": 0.0338,
      "reward": 0.47137061692774296,
      "reward_std": 0.7821067273616791,
      "rewards/cosine_scaled_reward": -0.07681469712406397,
      "rewards/format_reward": 0.6250000074505806,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2442.0952758789062,
      "epoch": 1.9,
      "grad_norm": 0.5208225846290588,
      "kl": 0.32470703125,
      "learning_rate": 1.068365111445064e-07,
      "loss": 0.0837,
      "reward": 0.38532300293445587,
      "reward_std": 0.5505756810307503,
      "rewards/cosine_scaled_reward": -0.1287670750170946,
      "rewards/format_reward": 0.6428571492433548,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2030.7857360839844,
      "epoch": 1.904,
      "grad_norm": 0.6633386611938477,
      "kl": 0.269287109375,
      "learning_rate": 1.063017833182728e-07,
      "loss": 0.0183,
      "reward": 0.796258918941021,
      "reward_std": 0.8103819191455841,
      "rewards/cosine_scaled_reward": 0.017177060712128878,
      "rewards/format_reward": 0.761904776096344,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2734.2262573242188,
      "epoch": 1.908,
      "grad_norm": 0.5043067932128906,
      "kl": 0.35791015625,
      "learning_rate": 1.0578868071715544e-07,
      "loss": 0.0378,
      "reward": 0.2551159653812647,
      "reward_std": 0.5920611470937729,
      "rewards/cosine_scaled_reward": -0.155180131085217,
      "rewards/format_reward": 0.5654762089252472,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2301.0952758789062,
      "epoch": 1.912,
      "grad_norm": 0.7771977186203003,
      "kl": 0.28662109375,
      "learning_rate": 1.0529722834905125e-07,
      "loss": 0.0228,
      "reward": 0.6790298409759998,
      "reward_std": 0.6661486774682999,
      "rewards/cosine_scaled_reward": -0.06524700409499928,
      "rewards/format_reward": 0.8095238357782364,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2247.6012573242188,
      "epoch": 1.916,
      "grad_norm": 0.599141001701355,
      "kl": 0.3212890625,
      "learning_rate": 1.0482745016665526e-07,
      "loss": 0.0857,
      "reward": 0.8667033798992634,
      "reward_std": 0.8036679923534393,
      "rewards/cosine_scaled_reward": 0.06132788397371769,
      "rewards/format_reward": 0.7440476268529892,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2093.3333740234375,
      "epoch": 1.92,
      "grad_norm": 0.5312609076499939,
      "kl": 0.2958984375,
      "learning_rate": 1.0437936906629334e-07,
      "loss": 0.0735,
      "reward": 0.7348574697971344,
      "reward_std": 0.689183309674263,
      "rewards/cosine_scaled_reward": -0.034356983145698905,
      "rewards/format_reward": 0.8035714328289032,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2448.1726684570312,
      "epoch": 1.924,
      "grad_norm": 0.5402917861938477,
      "kl": 0.36669921875,
      "learning_rate": 1.0395300688680625e-07,
      "loss": 0.0831,
      "reward": 0.43995974212884903,
      "reward_std": 0.6862698197364807,
      "rewards/cosine_scaled_reward": -0.13121061958372593,
      "rewards/format_reward": 0.70238097012043,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2658.7977294921875,
      "epoch": 1.928,
      "grad_norm": 0.5909121632575989,
      "kl": 0.3623046875,
      "learning_rate": 1.0354838440848501e-07,
      "loss": 0.0547,
      "reward": 0.5179771184921265,
      "reward_std": 0.7944772690534592,
      "rewards/cosine_scaled_reward": -0.0981543204979971,
      "rewards/format_reward": 0.7142857313156128,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2292.1428833007812,
      "epoch": 1.932,
      "grad_norm": 0.549201488494873,
      "kl": 0.30615234375,
      "learning_rate": 1.0316552135205837e-07,
      "loss": 0.0906,
      "reward": 0.6546563804149628,
      "reward_std": 0.7558221146464348,
      "rewards/cosine_scaled_reward": -0.017909929156303406,
      "rewards/format_reward": 0.690476194024086,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2001.2381286621094,
      "epoch": 1.936,
      "grad_norm": 0.9190180897712708,
      "kl": 0.2490234375,
      "learning_rate": 1.0280443637773163e-07,
      "loss": -0.0131,
      "reward": 0.6368911117315292,
      "reward_std": 0.5770624950528145,
      "rewards/cosine_scaled_reward": -0.07441157009452581,
      "rewards/format_reward": 0.7857142835855484,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2369.482177734375,
      "epoch": 1.94,
      "grad_norm": 0.48303577303886414,
      "kl": 0.32861328125,
      "learning_rate": 1.0246514708427701e-07,
      "loss": 0.0806,
      "reward": 0.5949805751442909,
      "reward_std": 0.7235869467258453,
      "rewards/cosine_scaled_reward": -0.04179543023929,
      "rewards/format_reward": 0.6785714328289032,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2607.375030517578,
      "epoch": 1.944,
      "grad_norm": 0.45922327041625977,
      "kl": 0.334228515625,
      "learning_rate": 1.0214767000817596e-07,
      "loss": 0.0495,
      "reward": 0.4739008713513613,
      "reward_std": 0.7411531507968903,
      "rewards/cosine_scaled_reward": -0.10828767996281385,
      "rewards/format_reward": 0.690476194024086,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2427.3929443359375,
      "epoch": 1.948,
      "grad_norm": 0.42017099261283875,
      "kl": 0.281494140625,
      "learning_rate": 1.0185202062281336e-07,
      "loss": 0.0322,
      "reward": 0.3904539607465267,
      "reward_std": 0.6817184686660767,
      "rewards/cosine_scaled_reward": -0.13810635451227427,
      "rewards/format_reward": 0.6666666716337204,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2579.0120239257812,
      "epoch": 1.952,
      "grad_norm": 0.7048377394676208,
      "kl": 0.322265625,
      "learning_rate": 1.0157821333772304e-07,
      "loss": 0.028,
      "reward": 0.5259524993598461,
      "reward_std": 0.6612162664532661,
      "rewards/cosine_scaled_reward": -0.09714281000196934,
      "rewards/format_reward": 0.7202381193637848,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2246.9524536132812,
      "epoch": 1.956,
      "grad_norm": 0.4814748167991638,
      "kl": 0.313720703125,
      "learning_rate": 1.013262614978859e-07,
      "loss": 0.0807,
      "reward": 0.7873745709657669,
      "reward_std": 0.7711023241281509,
      "rewards/cosine_scaled_reward": -0.005122252739965916,
      "rewards/format_reward": 0.7976190745830536,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2849.761962890625,
      "epoch": 1.96,
      "grad_norm": 0.5227950215339661,
      "kl": 0.37890625,
      "learning_rate": 1.0109617738307911e-07,
      "loss": 0.0171,
      "reward": 0.4944131616503,
      "reward_std": 0.6706523001194,
      "rewards/cosine_scaled_reward": -0.06826960667967796,
      "rewards/format_reward": 0.6309524029493332,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2594.3631591796875,
      "epoch": 1.964,
      "grad_norm": 0.5116230249404907,
      "kl": 0.40576171875,
      "learning_rate": 1.0088797220727779e-07,
      "loss": 0.0511,
      "reward": 0.40869739279150963,
      "reward_std": 0.703234076499939,
      "rewards/cosine_scaled_reward": -0.12600845471024513,
      "rewards/format_reward": 0.6607142984867096,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2416.8632202148438,
      "epoch": 1.968,
      "grad_norm": 0.9567095637321472,
      "kl": 0.27783203125,
      "learning_rate": 1.0070165611810855e-07,
      "loss": 0.1235,
      "reward": 0.5404094010591507,
      "reward_std": 0.6638472378253937,
      "rewards/cosine_scaled_reward": -0.054200079292058945,
      "rewards/format_reward": 0.6488095372915268,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2514.1666870117188,
      "epoch": 1.972,
      "grad_norm": 0.4846276044845581,
      "kl": 0.28466796875,
      "learning_rate": 1.005372381963547e-07,
      "loss": 0.0436,
      "reward": 0.5605661012232304,
      "reward_std": 0.6418938338756561,
      "rewards/cosine_scaled_reward": -0.03519314527511597,
      "rewards/format_reward": 0.630952388048172,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2289.71435546875,
      "epoch": 1.976,
      "grad_norm": 0.6296063661575317,
      "kl": 0.3212890625,
      "learning_rate": 1.0039472645551372e-07,
      "loss": 0.0439,
      "reward": 0.6024229377508163,
      "reward_std": 0.7128957360982895,
      "rewards/cosine_scaled_reward": -0.07081234554061666,
      "rewards/format_reward": 0.744047611951828,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2491.3334045410156,
      "epoch": 1.98,
      "grad_norm": 0.622008204460144,
      "kl": 0.283447265625,
      "learning_rate": 1.002741278414069e-07,
      "loss": 0.0361,
      "reward": 0.594695046544075,
      "reward_std": 0.6841184943914413,
      "rewards/cosine_scaled_reward": -0.03896199120208621,
      "rewards/format_reward": 0.6726190745830536,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2395.636962890625,
      "epoch": 1.984,
      "grad_norm": 0.30918648838996887,
      "kl": 0.29931640625,
      "learning_rate": 1.0017544823184055e-07,
      "loss": 0.0827,
      "reward": 0.5910248765721917,
      "reward_std": 0.6182541996240616,
      "rewards/cosine_scaled_reward": -0.05567805375903845,
      "rewards/format_reward": 0.70238097012043,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2368.8631591796875,
      "epoch": 1.988,
      "grad_norm": 1.1213865280151367,
      "kl": 0.3515625,
      "learning_rate": 1.0009869243631952e-07,
      "loss": 0.0439,
      "reward": 0.48846913129091263,
      "reward_std": 0.6297848075628281,
      "rewards/cosine_scaled_reward": -0.13374162535183132,
      "rewards/format_reward": 0.755952388048172,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2289.3690795898438,
      "epoch": 1.992,
      "grad_norm": 0.810573399066925,
      "kl": 0.302734375,
      "learning_rate": 1.000438641958131e-07,
      "loss": 0.0228,
      "reward": 0.6517780050635338,
      "reward_std": 0.7580654174089432,
      "rewards/cosine_scaled_reward": -0.046134804193570744,
      "rewards/format_reward": 0.7440476417541504,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2452.2738647460938,
      "epoch": 1.996,
      "grad_norm": 0.357543408870697,
      "kl": 0.32763671875,
      "learning_rate": 1.0001096618257236e-07,
      "loss": 0.0643,
      "reward": 0.3793360572308302,
      "reward_std": 0.7790006846189499,
      "rewards/cosine_scaled_reward": -0.15557007491588593,
      "rewards/format_reward": 0.6904762089252472,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2528.7500610351562,
      "epoch": 2.0,
      "grad_norm": 0.9307948350906372,
      "kl": 0.306640625,
      "learning_rate": 1e-07,
      "loss": 0.1364,
      "reward": 0.5999207645654678,
      "reward_std": 0.6981495916843414,
      "rewards/cosine_scaled_reward": -0.03634915268048644,
      "rewards/format_reward": 0.6726190596818924,
      "step": 500
    },
    {
      "epoch": 2.0,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.0725239302306436,
      "train_runtime": 62033.0192,
      "train_samples_per_second": 1.354,
      "train_steps_per_second": 0.008
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}