{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.856898029134533,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 2770.8472290039062,
      "epoch": 0.001713796058269066,
      "grad_norm": 0.15192405879497528,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": 0.014,
      "reward": -0.06689765583723783,
      "reward_std": 0.505804143846035,
      "rewards/cosine_scaled_reward": -0.03344883490353823,
      "rewards/format_reward": 0.0,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2785.013916015625,
      "epoch": 0.003427592116538132,
      "grad_norm": 0.1657538264989853,
      "kl": 0.0,
      "learning_rate": 2e-08,
      "loss": -0.0211,
      "reward": -0.4646243788301945,
      "reward_std": 0.39301297068595886,
      "rewards/cosine_scaled_reward": -0.23231217823922634,
      "rewards/format_reward": 0.0,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2713.027801513672,
      "epoch": 0.005141388174807198,
      "grad_norm": 0.1747598648071289,
      "kl": 3.5196542739868164e-05,
      "learning_rate": 4e-08,
      "loss": -0.0275,
      "reward": -0.23865782655775547,
      "reward_std": 0.4481763616204262,
      "rewards/cosine_scaled_reward": -0.11932891746982932,
      "rewards/format_reward": 0.0,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2938.5277709960938,
      "epoch": 0.006855184233076264,
      "grad_norm": 0.16107600927352905,
      "kl": 3.7282705307006836e-05,
      "learning_rate": 6e-08,
      "loss": -0.0289,
      "reward": 0.06913903169333935,
      "reward_std": 0.6892540901899338,
      "rewards/cosine_scaled_reward": 0.03456950932741165,
      "rewards/format_reward": 0.0,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2532.7222290039062,
      "epoch": 0.00856898029134533,
      "grad_norm": 0.15964782238006592,
      "kl": 2.065300941467285e-05,
      "learning_rate": 8e-08,
      "loss": -0.0052,
      "reward": -0.15601756004616618,
      "reward_std": 0.5161308571696281,
      "rewards/cosine_scaled_reward": -0.07800877187401056,
      "rewards/format_reward": 0.0,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3131.25,
      "epoch": 0.010282776349614395,
      "grad_norm": 0.13910692930221558,
      "kl": 4.1961669921875e-05,
      "learning_rate": 1e-07,
      "loss": 0.029,
      "reward": -0.13883829297265038,
      "reward_std": 0.5291023775935173,
      "rewards/cosine_scaled_reward": -0.06941914733033627,
      "rewards/format_reward": 0.0,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2258.6944885253906,
      "epoch": 0.011996572407883462,
      "grad_norm": 0.21499329805374146,
      "kl": 3.059208393096924e-05,
      "learning_rate": 1.2e-07,
      "loss": -0.0297,
      "reward": -0.22816578298807144,
      "reward_std": 0.5721099078655243,
      "rewards/cosine_scaled_reward": -0.11408288218080997,
      "rewards/format_reward": 0.0,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3106.65283203125,
      "epoch": 0.013710368466152529,
      "grad_norm": 0.15807782113552094,
      "kl": 3.281235694885254e-05,
      "learning_rate": 1.4e-07,
      "loss": 0.0518,
      "reward": -0.1028524599969387,
      "reward_std": 0.7277905195951462,
      "rewards/cosine_scaled_reward": -0.051426228135824203,
      "rewards/format_reward": 0.0,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2652.2777709960938,
      "epoch": 0.015424164524421594,
      "grad_norm": 0.14988838136196136,
      "kl": 3.746151924133301e-05,
      "learning_rate": 1.6e-07,
      "loss": -0.0052,
      "reward": -0.04764566984522389,
      "reward_std": 0.6422684416174889,
      "rewards/cosine_scaled_reward": -0.023822834249585867,
      "rewards/format_reward": 0.0,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2956.250030517578,
      "epoch": 0.01713796058269066,
      "grad_norm": 0.15577340126037598,
      "kl": 3.62396240234375e-05,
      "learning_rate": 1.8e-07,
      "loss": 0.0369,
      "reward": -0.09274669736623764,
      "reward_std": 0.6059432476758957,
      "rewards/cosine_scaled_reward": -0.046373344492167234,
      "rewards/format_reward": 0.0,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2610.430633544922,
      "epoch": 0.018851756640959727,
      "grad_norm": 0.18031956255435944,
      "kl": 2.753734588623047e-05,
      "learning_rate": 2e-07,
      "loss": 0.0126,
      "reward": 0.17614622993642115,
      "reward_std": 0.7455325201153755,
      "rewards/cosine_scaled_reward": 0.08807311341661261,
      "rewards/format_reward": 0.0,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2977.2638549804688,
      "epoch": 0.02056555269922879,
      "grad_norm": 0.15254004299640656,
      "kl": 3.084540367126465e-05,
      "learning_rate": 2.1999999999999998e-07,
      "loss": -0.0238,
      "reward": -0.2835669822525233,
      "reward_std": 0.6270563155412674,
      "rewards/cosine_scaled_reward": -0.14178348786663264,
      "rewards/format_reward": 0.0,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2601.7916870117188,
      "epoch": 0.022279348757497857,
      "grad_norm": 0.1897689402103424,
      "kl": 4.309415817260742e-05,
      "learning_rate": 2.4e-07,
      "loss": -0.008,
      "reward": -0.08701697085052729,
      "reward_std": 0.6209904551506042,
      "rewards/cosine_scaled_reward": -0.04350848635658622,
      "rewards/format_reward": 0.0,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2891.3472290039062,
      "epoch": 0.023993144815766924,
      "grad_norm": 0.17451944947242737,
      "kl": 3.2007694244384766e-05,
      "learning_rate": 2.6e-07,
      "loss": 0.0134,
      "reward": -0.11856314726173878,
      "reward_std": 0.5714613646268845,
      "rewards/cosine_scaled_reward": -0.059281568974256516,
      "rewards/format_reward": 0.0,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3376.9444580078125,
      "epoch": 0.02570694087403599,
      "grad_norm": 0.19522128999233246,
      "kl": 4.00543212890625e-05,
      "learning_rate": 2.8e-07,
      "loss": 0.0625,
      "reward": -0.3375568427145481,
      "reward_std": 0.5690607726573944,
      "rewards/cosine_scaled_reward": -0.16877843253314495,
      "rewards/format_reward": 0.0,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2385.9861450195312,
      "epoch": 0.027420736932305057,
      "grad_norm": 0.17156164348125458,
      "kl": 3.203749656677246e-05,
      "learning_rate": 3e-07,
      "loss": 0.0479,
      "reward": 0.31096187606453896,
      "reward_std": 0.719051368534565,
      "rewards/cosine_scaled_reward": 0.15548093989491463,
      "rewards/format_reward": 0.0,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2834.4166870117188,
      "epoch": 0.02913453299057412,
      "grad_norm": 0.1916762739419937,
      "kl": 3.910064697265625e-05,
      "learning_rate": 3.2e-07,
      "loss": 0.0288,
      "reward": -0.1371638989658095,
      "reward_std": 0.43335365504026413,
      "rewards/cosine_scaled_reward": -0.06858194415690377,
      "rewards/format_reward": 0.0,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3107.9166870117188,
      "epoch": 0.030848329048843187,
      "grad_norm": 0.20290644466876984,
      "kl": 3.56137752532959e-05,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.0182,
      "reward": -0.2907893192023039,
      "reward_std": 0.43716832995414734,
      "rewards/cosine_scaled_reward": -0.14539465866982937,
      "rewards/format_reward": 0.0,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3065.611083984375,
      "epoch": 0.032562125107112254,
      "grad_norm": 0.1492234468460083,
      "kl": 4.0084123611450195e-05,
      "learning_rate": 3.6e-07,
      "loss": 0.0216,
      "reward": -0.19093798706308007,
      "reward_std": 0.7698801159858704,
      "rewards/cosine_scaled_reward": -0.09546899236738682,
      "rewards/format_reward": 0.0,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3355.7222900390625,
      "epoch": 0.03427592116538132,
      "grad_norm": 0.14321106672286987,
      "kl": 3.36766242980957e-05,
      "learning_rate": 3.7999999999999996e-07,
      "loss": -0.0048,
      "reward": -0.2757381685078144,
      "reward_std": 0.5536239072680473,
      "rewards/cosine_scaled_reward": -0.1378690842539072,
      "rewards/format_reward": 0.0,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2938.125,
      "epoch": 0.03598971722365039,
      "grad_norm": 0.20512644946575165,
      "kl": 4.1961669921875e-05,
      "learning_rate": 4e-07,
      "loss": 0.0577,
      "reward": -0.1858626427128911,
      "reward_std": 0.6686508804559708,
      "rewards/cosine_scaled_reward": -0.09293132461607456,
      "rewards/format_reward": 0.0,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3192.2361450195312,
      "epoch": 0.037703513281919454,
      "grad_norm": 0.13245940208435059,
      "kl": 3.49879264831543e-05,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 0.0372,
      "reward": -0.186855623498559,
      "reward_std": 0.5942067578434944,
      "rewards/cosine_scaled_reward": -0.09342780988663435,
      "rewards/format_reward": 0.0,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3075.02783203125,
      "epoch": 0.03941730934018852,
      "grad_norm": 0.14223958551883698,
      "kl": 2.8640031814575195e-05,
      "learning_rate": 4.3999999999999997e-07,
      "loss": -0.0208,
      "reward": -0.4465179964900017,
      "reward_std": 0.36973506212234497,
      "rewards/cosine_scaled_reward": -0.223259000107646,
      "rewards/format_reward": 0.0,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2707.6250610351562,
      "epoch": 0.04113110539845758,
      "grad_norm": 0.20090773701667786,
      "kl": 2.9414892196655273e-05,
      "learning_rate": 4.6e-07,
      "loss": 0.0292,
      "reward": 0.08563654706813395,
      "reward_std": 0.4666801244020462,
      "rewards/cosine_scaled_reward": 0.04281827830709517,
      "rewards/format_reward": 0.0,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2578.9443969726562,
      "epoch": 0.04284490145672665,
      "grad_norm": 0.19762183725833893,
      "kl": 2.6911497116088867e-05,
      "learning_rate": 4.8e-07,
      "loss": 0.0547,
      "reward": -0.15825002267956734,
      "reward_std": 0.6721501722931862,
      "rewards/cosine_scaled_reward": -0.07912501133978367,
      "rewards/format_reward": 0.0,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3199.1111450195312,
      "epoch": 0.044558697514995714,
      "grad_norm": 0.14947673678398132,
      "kl": 3.1381845474243164e-05,
      "learning_rate": 5e-07,
      "loss": 0.0771,
      "reward": -0.3339938232675195,
      "reward_std": 0.5660227835178375,
      "rewards/cosine_scaled_reward": -0.16699691163375974,
      "rewards/format_reward": 0.0,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3103.3193969726562,
      "epoch": 0.04627249357326478,
      "grad_norm": 0.12868770956993103,
      "kl": 2.6017427444458008e-05,
      "learning_rate": 5.2e-07,
      "loss": 0.0118,
      "reward": -0.2791058011353016,
      "reward_std": 0.49328897148370743,
      "rewards/cosine_scaled_reward": -0.13955289125442505,
      "rewards/format_reward": 0.0,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2378.2222595214844,
      "epoch": 0.04798628963153385,
      "grad_norm": 0.2462579607963562,
      "kl": 2.7805566787719727e-05,
      "learning_rate": 5.4e-07,
      "loss": 0.0596,
      "reward": 0.03218653332442045,
      "reward_std": 0.6807225868105888,
      "rewards/cosine_scaled_reward": 0.016093265498057008,
      "rewards/format_reward": 0.0,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2971.291748046875,
      "epoch": 0.049700085689802914,
      "grad_norm": 0.16591639816761017,
      "kl": 3.515183925628662e-05,
      "learning_rate": 5.6e-07,
      "loss": 0.0141,
      "reward": 0.011478596366941929,
      "reward_std": 0.7397755682468414,
      "rewards/cosine_scaled_reward": 0.005739298183470964,
      "rewards/format_reward": 0.0,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2913.3611450195312,
      "epoch": 0.05141388174807198,
      "grad_norm": 0.13886681199073792,
      "kl": 3.2573938369750977e-05,
      "learning_rate": 5.8e-07,
      "loss": 0.0258,
      "reward": 0.05036446265876293,
      "reward_std": 0.6957473307847977,
      "rewards/cosine_scaled_reward": 0.025182233192026615,
      "rewards/format_reward": 0.0,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2665.041748046875,
      "epoch": 0.05312767780634105,
      "grad_norm": 0.16625739634037018,
      "kl": 2.000480890274048e-05,
      "learning_rate": 6e-07,
      "loss": 0.0045,
      "reward": -0.044122666819021106,
      "reward_std": 0.4255269840359688,
      "rewards/cosine_scaled_reward": -0.022061329917050898,
      "rewards/format_reward": 0.0,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2951.8611450195312,
      "epoch": 0.054841473864610114,
      "grad_norm": 0.15594074130058289,
      "kl": 1.9147992134094238e-05,
      "learning_rate": 6.2e-07,
      "loss": 0.0942,
      "reward": -0.3072533793747425,
      "reward_std": 0.4980456754565239,
      "rewards/cosine_scaled_reward": -0.15362668968737125,
      "rewards/format_reward": 0.0,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2260.0833435058594,
      "epoch": 0.056555269922879174,
      "grad_norm": 0.21370142698287964,
      "kl": 3.5665929317474365e-05,
      "learning_rate": 6.4e-07,
      "loss": 0.0063,
      "reward": 0.06617816537618637,
      "reward_std": 0.5614925771951675,
      "rewards/cosine_scaled_reward": 0.033089087810367346,
      "rewards/format_reward": 0.0,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2807.013916015625,
      "epoch": 0.05826906598114824,
      "grad_norm": 0.20051412284374237,
      "kl": 1.2192875146865845e-05,
      "learning_rate": 6.6e-07,
      "loss": 0.0328,
      "reward": -0.17473484575748444,
      "reward_std": 0.6600858569145203,
      "rewards/cosine_scaled_reward": -0.08736742846667767,
      "rewards/format_reward": 0.0,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3120.7500610351562,
      "epoch": 0.05998286203941731,
      "grad_norm": 0.13361996412277222,
      "kl": 3.407895565032959e-05,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0472,
      "reward": -0.4979929216206074,
      "reward_std": 0.39260104298591614,
      "rewards/cosine_scaled_reward": -0.2489964533597231,
      "rewards/format_reward": 0.0,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2625.0694885253906,
      "epoch": 0.061696658097686374,
      "grad_norm": 0.16467803716659546,
      "kl": 2.7239322662353516e-05,
      "learning_rate": 7e-07,
      "loss": -0.0168,
      "reward": -0.35937849269248545,
      "reward_std": 0.45373768359422684,
      "rewards/cosine_scaled_reward": -0.1796892363927327,
      "rewards/format_reward": 0.0,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3042.1806030273438,
      "epoch": 0.06341045415595545,
      "grad_norm": 0.15104345977306366,
      "kl": 2.9146671295166016e-05,
      "learning_rate": 7.2e-07,
      "loss": 0.0068,
      "reward": -0.37954360246658325,
      "reward_std": 0.5432159453630447,
      "rewards/cosine_scaled_reward": -0.18977180123329163,
      "rewards/format_reward": 0.0,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3193.2083740234375,
      "epoch": 0.06512425021422451,
      "grad_norm": 0.20619741082191467,
      "kl": 1.7097219824790955e-05,
      "learning_rate": 7.4e-07,
      "loss": 0.0389,
      "reward": -0.29821273358538747,
      "reward_std": 0.5581861883401871,
      "rewards/cosine_scaled_reward": -0.1491063602734357,
      "rewards/format_reward": 0.0,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3018.5834045410156,
      "epoch": 0.06683804627249357,
      "grad_norm": 0.12940338253974915,
      "kl": 3.90857458114624e-05,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.0162,
      "reward": -0.25728118792176247,
      "reward_std": 0.34478260576725006,
      "rewards/cosine_scaled_reward": -0.12864059768617153,
      "rewards/format_reward": 0.0,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2860.7500610351562,
      "epoch": 0.06855184233076264,
      "grad_norm": 0.25654301047325134,
      "kl": 0.0001112818717956543,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.0545,
      "reward": 0.13069207593798637,
      "reward_std": 0.5447051227092743,
      "rewards/cosine_scaled_reward": 0.06534605007618666,
      "rewards/format_reward": 0.0,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2696.3611450195312,
      "epoch": 0.0702656383890317,
      "grad_norm": 0.19896458089351654,
      "kl": 4.2378902435302734e-05,
      "learning_rate": 8e-07,
      "loss": 0.0826,
      "reward": 0.2564197585452348,
      "reward_std": 0.6877201497554779,
      "rewards/cosine_scaled_reward": 0.12820987740997225,
      "rewards/format_reward": 0.0,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2642.3333740234375,
      "epoch": 0.07197943444730077,
      "grad_norm": 0.1658892035484314,
      "kl": 0.00020813941955566406,
      "learning_rate": 8.199999999999999e-07,
      "loss": 0.0149,
      "reward": -0.03526473790407181,
      "reward_std": 0.6603178381919861,
      "rewards/cosine_scaled_reward": -0.017632372677326202,
      "rewards/format_reward": 0.0,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2897.388916015625,
      "epoch": 0.07369323050556983,
      "grad_norm": 0.2002326250076294,
      "kl": 6.079673767089844e-05,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.055,
      "reward": 0.08917492628097534,
      "reward_std": 0.4714968279004097,
      "rewards/cosine_scaled_reward": 0.04458745941519737,
      "rewards/format_reward": 0.0,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2802.9583740234375,
      "epoch": 0.07540702656383891,
      "grad_norm": 0.14357756078243256,
      "kl": 0.00016063451766967773,
      "learning_rate": 8.599999999999999e-07,
      "loss": 0.0109,
      "reward": -0.2601087912917137,
      "reward_std": 0.5872670859098434,
      "rewards/cosine_scaled_reward": -0.13005439937114716,
      "rewards/format_reward": 0.0,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3034.8194580078125,
      "epoch": 0.07712082262210797,
      "grad_norm": 0.23196536302566528,
      "kl": 0.00012412667274475098,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.0428,
      "reward": -0.2070726901292801,
      "reward_std": 0.5877418145537376,
      "rewards/cosine_scaled_reward": -0.10353635251522064,
      "rewards/format_reward": 0.0,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2306.2083435058594,
      "epoch": 0.07883461868037704,
      "grad_norm": 0.2409650981426239,
      "kl": 0.0003217458724975586,
      "learning_rate": 9e-07,
      "loss": 0.0337,
      "reward": -0.01094321720302105,
      "reward_std": 0.6599317938089371,
      "rewards/cosine_scaled_reward": -0.00547160767018795,
      "rewards/format_reward": 0.0,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2936.1388549804688,
      "epoch": 0.0805484147386461,
      "grad_norm": 0.1777871698141098,
      "kl": 0.0003833882510662079,
      "learning_rate": 9.2e-07,
      "loss": -0.0387,
      "reward": -0.12989605404436588,
      "reward_std": 0.6336122080683708,
      "rewards/cosine_scaled_reward": -0.0649480305146426,
      "rewards/format_reward": 0.0,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2661.6806030273438,
      "epoch": 0.08226221079691516,
      "grad_norm": 0.3158990442752838,
      "kl": 0.00035144388675689697,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.1047,
      "reward": 0.12476684269495308,
      "reward_std": 0.5184459760785103,
      "rewards/cosine_scaled_reward": 0.06238342053256929,
      "rewards/format_reward": 0.0,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2053.8611450195312,
      "epoch": 0.08397600685518423,
      "grad_norm": 0.19082558155059814,
      "kl": 0.0006046295166015625,
      "learning_rate": 9.6e-07,
      "loss": -0.0144,
      "reward": 0.012501850724220276,
      "reward_std": 0.603157639503479,
      "rewards/cosine_scaled_reward": 0.006250927224755287,
      "rewards/format_reward": 0.0,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2731.6527404785156,
      "epoch": 0.0856898029134533,
      "grad_norm": 0.22663110494613647,
      "kl": 0.0009310245513916016,
      "learning_rate": 9.8e-07,
      "loss": 0.0409,
      "reward": -0.30116934701800346,
      "reward_std": 0.6284962445497513,
      "rewards/cosine_scaled_reward": -0.1505846632644534,
      "rewards/format_reward": 0.0,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2839.6944885253906,
      "epoch": 0.08740359897172237,
      "grad_norm": 0.17685562372207642,
      "kl": 0.0002518892288208008,
      "learning_rate": 1e-06,
      "loss": 0.0272,
      "reward": -0.16751686483621597,
      "reward_std": 0.5093529745936394,
      "rewards/cosine_scaled_reward": -0.08375842124223709,
      "rewards/format_reward": 0.0,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3141.4583740234375,
      "epoch": 0.08911739502999143,
      "grad_norm": 0.14409120380878448,
      "kl": 0.0003941059112548828,
      "learning_rate": 9.999890338174275e-07,
      "loss": -0.0079,
      "reward": -0.19580290652811527,
      "reward_std": 0.589723251760006,
      "rewards/cosine_scaled_reward": -0.09790145605802536,
      "rewards/format_reward": 0.0,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3054.9445190429688,
      "epoch": 0.0908311910882605,
      "grad_norm": 0.13203154504299164,
      "kl": 0.0002570152282714844,
      "learning_rate": 9.999561358041868e-07,
      "loss": 0.0455,
      "reward": -0.2164551168680191,
      "reward_std": 0.6407450139522552,
      "rewards/cosine_scaled_reward": -0.10822756588459015,
      "rewards/format_reward": 0.0,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3393.8055419921875,
      "epoch": 0.09254498714652956,
      "grad_norm": 0.11958733946084976,
      "kl": 0.0005993843078613281,
      "learning_rate": 9.999013075636804e-07,
      "loss": -0.007,
      "reward": -0.27613697946071625,
      "reward_std": 0.5631539821624756,
      "rewards/cosine_scaled_reward": -0.13806848879903555,
      "rewards/format_reward": 0.0,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3430.3055419921875,
      "epoch": 0.09425878320479864,
      "grad_norm": 0.13475047051906586,
      "kl": 0.0003286600112915039,
      "learning_rate": 9.998245517681593e-07,
      "loss": 0.0301,
      "reward": -0.2911250814795494,
      "reward_std": 0.5787934809923172,
      "rewards/cosine_scaled_reward": -0.145562544465065,
      "rewards/format_reward": 0.0,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3075.1527709960938,
      "epoch": 0.0959725792630677,
      "grad_norm": 0.14396199584007263,
      "kl": 0.0008380413055419922,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0481,
      "reward": -0.058986596763134,
      "reward_std": 0.5793360769748688,
      "rewards/cosine_scaled_reward": -0.029493287205696106,
      "rewards/format_reward": 0.0,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3232.9722900390625,
      "epoch": 0.09768637532133675,
      "grad_norm": 0.14357316493988037,
      "kl": 0.0003731250762939453,
      "learning_rate": 9.996052735444862e-07,
      "loss": 0.0542,
      "reward": -0.08436356298625469,
      "reward_std": 0.4788799285888672,
      "rewards/cosine_scaled_reward": -0.042181783355772495,
      "rewards/format_reward": 0.0,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3087.3194580078125,
      "epoch": 0.09940017137960583,
      "grad_norm": 0.15331892669200897,
      "kl": 0.0012726783752441406,
      "learning_rate": 9.994627618036452e-07,
      "loss": 0.0529,
      "reward": -0.29565126448869705,
      "reward_std": 0.5033575221896172,
      "rewards/cosine_scaled_reward": -0.1478256327100098,
      "rewards/format_reward": 0.0,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3110.6111450195312,
      "epoch": 0.10111396743787489,
      "grad_norm": 0.14103592932224274,
      "kl": 0.0015869140625,
      "learning_rate": 9.992983438818915e-07,
      "loss": 0.0384,
      "reward": 0.018456660211086273,
      "reward_std": 0.8149007856845856,
      "rewards/cosine_scaled_reward": 0.009228323586285114,
      "rewards/format_reward": 0.0,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3305.236083984375,
      "epoch": 0.10282776349614396,
      "grad_norm": 0.12172071635723114,
      "kl": 0.00035071372985839844,
      "learning_rate": 9.991120277927223e-07,
      "loss": 0.0086,
      "reward": -0.27341870963573456,
      "reward_std": 0.7006796821951866,
      "rewards/cosine_scaled_reward": -0.13670935295522213,
      "rewards/format_reward": 0.0,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3224.0555419921875,
      "epoch": 0.10454155955441302,
      "grad_norm": 0.13248133659362793,
      "kl": 0.0005254745483398438,
      "learning_rate": 9.989038226169207e-07,
      "loss": -0.0068,
      "reward": -0.2998387850821018,
      "reward_std": 0.3452136740088463,
      "rewards/cosine_scaled_reward": -0.14991939440369606,
      "rewards/format_reward": 0.0,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2643.5833740234375,
      "epoch": 0.1062553556126821,
      "grad_norm": 0.17902526259422302,
      "kl": 0.0021648406982421875,
      "learning_rate": 9.98673738502114e-07,
      "loss": 0.057,
      "reward": 0.017559568164870143,
      "reward_std": 0.5955966338515282,
      "rewards/cosine_scaled_reward": 0.008779789437539876,
      "rewards/format_reward": 0.0,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3496.375,
      "epoch": 0.10796915167095116,
      "grad_norm": 0.1432785838842392,
      "kl": 0.00047206878662109375,
      "learning_rate": 9.98421786662277e-07,
      "loss": 0.0277,
      "reward": -0.17097678780555725,
      "reward_std": 0.6070086807012558,
      "rewards/cosine_scaled_reward": -0.08548840321600437,
      "rewards/format_reward": 0.0,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2792.486114501953,
      "epoch": 0.10968294772922023,
      "grad_norm": 0.16470499336719513,
      "kl": 0.0011835098266601562,
      "learning_rate": 9.981479793771866e-07,
      "loss": 0.0207,
      "reward": -0.26402536034584045,
      "reward_std": 0.43254173547029495,
      "rewards/cosine_scaled_reward": -0.13201268389821053,
      "rewards/format_reward": 0.0,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3128.4861450195312,
      "epoch": 0.11139674378748929,
      "grad_norm": 0.1882910132408142,
      "kl": 0.006333351135253906,
      "learning_rate": 9.97852329991824e-07,
      "loss": 0.0385,
      "reward": -0.0892822165042162,
      "reward_std": 0.6130652017891407,
      "rewards/cosine_scaled_reward": -0.04464110638946295,
      "rewards/format_reward": 0.0,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2806.75,
      "epoch": 0.11311053984575835,
      "grad_norm": 0.15443913638591766,
      "kl": 0.0003552436828613281,
      "learning_rate": 9.975348529157229e-07,
      "loss": 0.0038,
      "reward": -0.04117146506905556,
      "reward_std": 0.4872736781835556,
      "rewards/cosine_scaled_reward": -0.02058573253452778,
      "rewards/format_reward": 0.0,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3150.8194580078125,
      "epoch": 0.11482433590402742,
      "grad_norm": 0.15191471576690674,
      "kl": 0.0016102790832519531,
      "learning_rate": 9.971955636222684e-07,
      "loss": 0.0316,
      "reward": -0.23821864277124405,
      "reward_std": 0.5326030105352402,
      "rewards/cosine_scaled_reward": -0.11910932138562202,
      "rewards/format_reward": 0.0,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2845.6806030273438,
      "epoch": 0.11653813196229648,
      "grad_norm": 0.1388000249862671,
      "kl": 0.0018000602722167969,
      "learning_rate": 9.968344786479415e-07,
      "loss": 0.0376,
      "reward": -0.17579936794936657,
      "reward_std": 0.6001454517245293,
      "rewards/cosine_scaled_reward": -0.08789968676865101,
      "rewards/format_reward": 0.0,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3050.7361450195312,
      "epoch": 0.11825192802056556,
      "grad_norm": 0.13662724196910858,
      "kl": 0.0015287399291992188,
      "learning_rate": 9.964516155915151e-07,
      "loss": 0.0787,
      "reward": -0.09626813535578549,
      "reward_std": 0.6232626661658287,
      "rewards/cosine_scaled_reward": -0.04813406406901777,
      "rewards/format_reward": 0.0,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2883.9722290039062,
      "epoch": 0.11996572407883462,
      "grad_norm": 0.18917521834373474,
      "kl": 0.00302886962890625,
      "learning_rate": 9.960469931131936e-07,
      "loss": -0.0608,
      "reward": 0.05035170167684555,
      "reward_std": 0.4191203862428665,
      "rewards/cosine_scaled_reward": 0.025175858289003372,
      "rewards/format_reward": 0.0,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3137.4583740234375,
      "epoch": 0.12167952013710369,
      "grad_norm": 0.15267273783683777,
      "kl": 0.0017466545104980469,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.0362,
      "reward": -0.04426470585167408,
      "reward_std": 0.6740965843200684,
      "rewards/cosine_scaled_reward": -0.022132341749966145,
      "rewards/format_reward": 0.0,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2443.0138549804688,
      "epoch": 0.12339331619537275,
      "grad_norm": 0.16214598715305328,
      "kl": 0.003936767578125,
      "learning_rate": 9.951725498333448e-07,
      "loss": -0.0396,
      "reward": 0.09306424111127853,
      "reward_std": 0.43733419477939606,
      "rewards/cosine_scaled_reward": 0.04653212707489729,
      "rewards/format_reward": 0.0,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3163.513916015625,
      "epoch": 0.12510711225364182,
      "grad_norm": 0.23524802923202515,
      "kl": 0.018090248107910156,
      "learning_rate": 9.947027716509488e-07,
      "loss": -0.0168,
      "reward": -0.17970024980604649,
      "reward_std": 0.4914797991514206,
      "rewards/cosine_scaled_reward": -0.0898501230403781,
      "rewards/format_reward": 0.0,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2410.25,
      "epoch": 0.1268209083119109,
      "grad_norm": 0.15706373751163483,
      "kl": 0.0030879974365234375,
      "learning_rate": 9.942113192828444e-07,
      "loss": 0.0191,
      "reward": 0.2525464817881584,
      "reward_std": 0.6606673151254654,
      "rewards/cosine_scaled_reward": 0.12627324275672436,
      "rewards/format_reward": 0.0,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3146.8611450195312,
      "epoch": 0.12853470437017994,
      "grad_norm": 0.15255555510520935,
      "kl": 0.0032701492309570312,
      "learning_rate": 9.93698216681727e-07,
      "loss": 0.0281,
      "reward": -0.07365414220839739,
      "reward_std": 0.5634644776582718,
      "rewards/cosine_scaled_reward": -0.036827060393989086,
      "rewards/format_reward": 0.0,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2159.249984741211,
      "epoch": 0.13024850042844902,
      "grad_norm": 0.39581403136253357,
      "kl": 0.018310546875,
      "learning_rate": 9.931634888554935e-07,
      "loss": -0.0072,
      "reward": 0.14826004952192307,
      "reward_std": 0.6063434556126595,
      "rewards/cosine_scaled_reward": 0.07413001451641321,
      "rewards/format_reward": 0.0,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3143.9443969726562,
      "epoch": 0.1319622964867181,
      "grad_norm": 0.13312797248363495,
      "kl": 0.00225830078125,
      "learning_rate": 9.926071618660237e-07,
      "loss": 0.0387,
      "reward": 0.15560828521847725,
      "reward_std": 0.680296927690506,
      "rewards/cosine_scaled_reward": 0.07780414074659348,
      "rewards/format_reward": 0.0,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3317.5416870117188,
      "epoch": 0.13367609254498714,
      "grad_norm": 0.13495096564292908,
      "kl": 0.0019426345825195312,
      "learning_rate": 9.9202926282791e-07,
      "loss": -0.0019,
      "reward": -0.4046759568154812,
      "reward_std": 0.5655369237065315,
      "rewards/cosine_scaled_reward": -0.20233797095716,
      "rewards/format_reward": 0.0,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2373.2361755371094,
      "epoch": 0.1353898886032562,
      "grad_norm": 0.26138797402381897,
      "kl": 0.010517120361328125,
      "learning_rate": 9.91429819907136e-07,
      "loss": 0.0351,
      "reward": -0.17695464938879013,
      "reward_std": 0.34004002809524536,
      "rewards/cosine_scaled_reward": -0.08847732283174992,
      "rewards/format_reward": 0.0,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3025.5000610351562,
      "epoch": 0.13710368466152528,
      "grad_norm": 0.17277857661247253,
      "kl": 0.0012784004211425781,
      "learning_rate": 9.908088623197048e-07,
      "loss": 0.0488,
      "reward": -0.08927152771502733,
      "reward_std": 0.6381218209862709,
      "rewards/cosine_scaled_reward": -0.04463577060960233,
      "rewards/format_reward": 0.0,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3080.2777709960938,
      "epoch": 0.13881748071979436,
      "grad_norm": 0.14923037588596344,
      "kl": 0.0020084381103515625,
      "learning_rate": 9.901664203302124e-07,
      "loss": 0.0073,
      "reward": -0.27667392790317535,
      "reward_std": 0.39360568672418594,
      "rewards/cosine_scaled_reward": -0.13833696395158768,
      "rewards/format_reward": 0.0,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2893.77783203125,
      "epoch": 0.1405312767780634,
      "grad_norm": 0.3161645531654358,
      "kl": 0.011153221130371094,
      "learning_rate": 9.895025252503755e-07,
      "loss": 0.0838,
      "reward": -0.08123429818078876,
      "reward_std": 0.6654616445302963,
      "rewards/cosine_scaled_reward": -0.040617153281345963,
      "rewards/format_reward": 0.0,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2858.736083984375,
      "epoch": 0.14224507283633248,
      "grad_norm": 0.1683678925037384,
      "kl": 0.001474142074584961,
      "learning_rate": 9.888172094375033e-07,
      "loss": -0.0148,
      "reward": -0.12576034758239985,
      "reward_std": 0.6605924665927887,
      "rewards/cosine_scaled_reward": -0.06288017379119992,
      "rewards/format_reward": 0.0,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2913.3194885253906,
      "epoch": 0.14395886889460155,
      "grad_norm": 0.22510592639446259,
      "kl": 0.0032978057861328125,
      "learning_rate": 9.881105062929221e-07,
      "loss": 0.042,
      "reward": -0.05945697799324989,
      "reward_std": 0.5878739953041077,
      "rewards/cosine_scaled_reward": -0.0297284796833992,
      "rewards/format_reward": 0.0,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2990.7916870117188,
      "epoch": 0.1456726649528706,
      "grad_norm": 0.14112693071365356,
      "kl": 0.0014820098876953125,
      "learning_rate": 9.873824502603459e-07,
      "loss": 0.0518,
      "reward": -0.05626801133621484,
      "reward_std": 0.5443409904837608,
      "rewards/cosine_scaled_reward": -0.028134002874139696,
      "rewards/format_reward": 0.0,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2831.6805419921875,
      "epoch": 0.14738646101113967,
      "grad_norm": 0.17547817528247833,
      "kl": 0.00238037109375,
      "learning_rate": 9.866330768241983e-07,
      "loss": 0.0229,
      "reward": -0.25049374252557755,
      "reward_std": 0.6190591081976891,
      "rewards/cosine_scaled_reward": -0.12524686381220818,
      "rewards/format_reward": 0.0,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3439.2639770507812,
      "epoch": 0.14910025706940874,
      "grad_norm": 0.12470373511314392,
      "kl": 0.0005965232849121094,
      "learning_rate": 9.85862422507884e-07,
      "loss": 0.0309,
      "reward": -0.15761397371534258,
      "reward_std": 0.568816527724266,
      "rewards/cosine_scaled_reward": -0.07880698406370357,
      "rewards/format_reward": 0.0,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3291.013916015625,
      "epoch": 0.15081405312767782,
      "grad_norm": 0.17072485387325287,
      "kl": 0.0011734962463378906,
      "learning_rate": 9.850705248720068e-07,
      "loss": -0.0003,
      "reward": -0.31209783256053925,
      "reward_std": 0.4534567594528198,
      "rewards/cosine_scaled_reward": -0.15604891628026962,
      "rewards/format_reward": 0.0,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2711.6111450195312,
      "epoch": 0.15252784918594686,
      "grad_norm": 0.17909394204616547,
      "kl": 0.00319671630859375,
      "learning_rate": 9.8425742251254e-07,
      "loss": -0.0351,
      "reward": -0.39153438061475754,
      "reward_std": 0.44514787942171097,
      "rewards/cosine_scaled_reward": -0.19576718658208847,
      "rewards/format_reward": 0.0,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2884.3611755371094,
      "epoch": 0.15424164524421594,
      "grad_norm": 0.1545180082321167,
      "kl": 0.0027666091918945312,
      "learning_rate": 9.83423155058946e-07,
      "loss": 0.0231,
      "reward": -0.12805988639593124,
      "reward_std": 0.41310104727745056,
      "rewards/cosine_scaled_reward": -0.06402994319796562,
      "rewards/format_reward": 0.0,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2961.8194580078125,
      "epoch": 0.155955441302485,
      "grad_norm": 0.17576223611831665,
      "kl": 0.004119873046875,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0293,
      "reward": -0.06583835743367672,
      "reward_std": 0.6373212188482285,
      "rewards/cosine_scaled_reward": -0.03291917638853192,
      "rewards/format_reward": 0.0,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2929.5277709960938,
      "epoch": 0.15766923736075408,
      "grad_norm": 0.14930115640163422,
      "kl": 0.0034623146057128906,
      "learning_rate": 9.816912885430258e-07,
      "loss": 0.0296,
      "reward": -0.18032184429466724,
      "reward_std": 0.6196585968136787,
      "rewards/cosine_scaled_reward": -0.09016093239188194,
      "rewards/format_reward": 0.0,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2748.7361450195312,
      "epoch": 0.15938303341902313,
      "grad_norm": 0.1628389209508896,
      "kl": 0.0011005401611328125,
      "learning_rate": 9.807937738894303e-07,
      "loss": 0.0544,
      "reward": -0.048349371179938316,
      "reward_std": 0.5468417555093765,
      "rewards/cosine_scaled_reward": -0.024174699559807777,
      "rewards/format_reward": 0.0,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2974.8194580078125,
      "epoch": 0.1610968294772922,
      "grad_norm": 0.17104412615299225,
      "kl": 0.0025157928466796875,
      "learning_rate": 9.798752629550546e-07,
      "loss": 0.0562,
      "reward": -0.10820803185924888,
      "reward_std": 0.5462353378534317,
      "rewards/cosine_scaled_reward": -0.05410401395056397,
      "rewards/format_reward": 0.0,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2822.3055419921875,
      "epoch": 0.16281062553556128,
      "grad_norm": 0.22087068855762482,
      "kl": 0.0032253265380859375,
      "learning_rate": 9.78935800506826e-07,
      "loss": 0.0157,
      "reward": -0.2787464428693056,
      "reward_std": 0.5101591870188713,
      "rewards/cosine_scaled_reward": -0.139373216079548,
      "rewards/format_reward": 0.0,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3177.1389770507812,
      "epoch": 0.16452442159383032,
      "grad_norm": 0.13341942429542542,
      "kl": 0.0016889572143554688,
      "learning_rate": 9.779754323328192e-07,
      "loss": 0.0599,
      "reward": 0.22422180697321892,
      "reward_std": 0.6203102543950081,
      "rewards/cosine_scaled_reward": 0.11211090348660946,
      "rewards/format_reward": 0.0,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3359.1666870117188,
      "epoch": 0.1662382176520994,
      "grad_norm": 0.17103053629398346,
      "kl": 0.0048770904541015625,
      "learning_rate": 9.769942052400235e-07,
      "loss": 0.0584,
      "reward": -0.34769631922245026,
      "reward_std": 0.5649063661694527,
      "rewards/cosine_scaled_reward": -0.17384816892445087,
      "rewards/format_reward": 0.0,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2853.8472290039062,
      "epoch": 0.16795201371036847,
      "grad_norm": 0.16162103414535522,
      "kl": 0.002391815185546875,
      "learning_rate": 9.759921670520634e-07,
      "loss": -0.0363,
      "reward": 0.04994424246251583,
      "reward_std": 0.4738911837339401,
      "rewards/cosine_scaled_reward": 0.024972120765596628,
      "rewards/format_reward": 0.0,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3113.9722290039062,
      "epoch": 0.16966580976863754,
      "grad_norm": 0.17794044315814972,
      "kl": 0.002719879150390625,
      "learning_rate": 9.749693666068663e-07,
      "loss": 0.0017,
      "reward": -0.16785867512226105,
      "reward_std": 0.5008634850382805,
      "rewards/cosine_scaled_reward": -0.08392933756113052,
      "rewards/format_reward": 0.0,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2779.9862060546875,
      "epoch": 0.1713796058269066,
      "grad_norm": 0.1735229194164276,
      "kl": 0.005786895751953125,
      "learning_rate": 9.739258537542835e-07,
      "loss": -0.0595,
      "reward": -0.15765622071921825,
      "reward_std": 0.4426313266158104,
      "rewards/cosine_scaled_reward": -0.07882811967283487,
      "rewards/format_reward": 0.0,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2904.5833129882812,
      "epoch": 0.17309340188517566,
      "grad_norm": 0.16130799055099487,
      "kl": 0.0022287368774414062,
      "learning_rate": 9.728616793536587e-07,
      "loss": -0.027,
      "reward": -0.2833556551486254,
      "reward_std": 0.41574449837207794,
      "rewards/cosine_scaled_reward": -0.14167783502489328,
      "rewards/format_reward": 0.0,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2956.9444580078125,
      "epoch": 0.17480719794344474,
      "grad_norm": 0.14904557168483734,
      "kl": 0.0023751258850097656,
      "learning_rate": 9.717768952713511e-07,
      "loss": 0.0249,
      "reward": -0.005829242058098316,
      "reward_std": 0.49208924546837807,
      "rewards/cosine_scaled_reward": -0.002914619166404009,
      "rewards/format_reward": 0.0,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3127.75,
      "epoch": 0.17652099400171378,
      "grad_norm": 0.13523682951927185,
      "kl": 0.0023593902587890625,
      "learning_rate": 9.706715543782064e-07,
      "loss": 0.0048,
      "reward": -0.16767939552664757,
      "reward_std": 0.497691310942173,
      "rewards/cosine_scaled_reward": -0.08383970521390438,
      "rewards/format_reward": 0.0,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3349.888916015625,
      "epoch": 0.17823479005998286,
      "grad_norm": 0.16127026081085205,
      "kl": 0.002029895782470703,
      "learning_rate": 9.695457105469804e-07,
      "loss": -0.0079,
      "reward": -0.4253583773970604,
      "reward_std": 0.5213425680994987,
      "rewards/cosine_scaled_reward": -0.2126791886985302,
      "rewards/format_reward": 0.0,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2762.3056030273438,
      "epoch": 0.17994858611825193,
      "grad_norm": 0.22534409165382385,
      "kl": 0.004019737243652344,
      "learning_rate": 9.683994186497132e-07,
      "loss": 0.0786,
      "reward": -0.13280940428376198,
      "reward_std": 0.6939076110720634,
      "rewards/cosine_scaled_reward": -0.06640470400452614,
      "rewards/format_reward": 0.0,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3027.013885498047,
      "epoch": 0.181662382176521,
      "grad_norm": 0.18191885948181152,
      "kl": 0.001827239990234375,
      "learning_rate": 9.672327345550543e-07,
      "loss": 0.0572,
      "reward": -0.30150486156344414,
      "reward_std": 0.5941706523299217,
      "rewards/cosine_scaled_reward": -0.15075243171304464,
      "rewards/format_reward": 0.0,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3236.4166870117188,
      "epoch": 0.18337617823479005,
      "grad_norm": 0.12520797550678253,
      "kl": 0.002315521240234375,
      "learning_rate": 9.66045715125541e-07,
      "loss": 0.0039,
      "reward": 0.061343319714069366,
      "reward_std": 0.5028644949197769,
      "rewards/cosine_scaled_reward": 0.030671661719679832,
      "rewards/format_reward": 0.0,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3337.3056030273438,
      "epoch": 0.18508997429305912,
      "grad_norm": 0.14343461394309998,
      "kl": 0.0016498565673828125,
      "learning_rate": 9.648384182148252e-07,
      "loss": 0.0438,
      "reward": -0.17464184761047363,
      "reward_std": 0.5610974803566933,
      "rewards/cosine_scaled_reward": -0.08732092566788197,
      "rewards/format_reward": 0.0,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2781.4305419921875,
      "epoch": 0.1868037703513282,
      "grad_norm": 0.1800822913646698,
      "kl": 0.0033397674560546875,
      "learning_rate": 9.636109026648554e-07,
      "loss": 0.0242,
      "reward": -0.26444700360298157,
      "reward_std": 0.5241282097995281,
      "rewards/cosine_scaled_reward": -0.13222350925207138,
      "rewards/format_reward": 0.0,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2989.3472290039062,
      "epoch": 0.18851756640959727,
      "grad_norm": 0.24952495098114014,
      "kl": 0.00222015380859375,
      "learning_rate": 9.623632283030077e-07,
      "loss": 0.1294,
      "reward": -0.038819944486021996,
      "reward_std": 0.7193348854780197,
      "rewards/cosine_scaled_reward": -0.019409974105656147,
      "rewards/format_reward": 0.0,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2775.6111450195312,
      "epoch": 0.19023136246786632,
      "grad_norm": 0.16514870524406433,
      "kl": 0.0029087066650390625,
      "learning_rate": 9.610954559391704e-07,
      "loss": 0.0565,
      "reward": -0.3495597681030631,
      "reward_std": 0.3909125030040741,
      "rewards/cosine_scaled_reward": -0.17477987939491868,
      "rewards/format_reward": 0.0,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2831.8055419921875,
      "epoch": 0.1919451585261354,
      "grad_norm": 0.13825589418411255,
      "kl": 0.0023212432861328125,
      "learning_rate": 9.598076473627796e-07,
      "loss": 0.0231,
      "reward": -0.17934568971395493,
      "reward_std": 0.5252480655908585,
      "rewards/cosine_scaled_reward": -0.08967284485697746,
      "rewards/format_reward": 0.0,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2977.8333740234375,
      "epoch": 0.19365895458440446,
      "grad_norm": 0.15359072387218475,
      "kl": 0.002410888671875,
      "learning_rate": 9.58499865339809e-07,
      "loss": 0.0053,
      "reward": 0.25896316685248166,
      "reward_std": 0.705707773566246,
      "rewards/cosine_scaled_reward": 0.12948158156359568,
      "rewards/format_reward": 0.0,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2234.2500610351562,
      "epoch": 0.1953727506426735,
      "grad_norm": 0.17650143802165985,
      "kl": 0.002643585205078125,
      "learning_rate": 9.571721736097088e-07,
      "loss": -0.0481,
      "reward": -0.20779240669799037,
      "reward_std": 0.50680061429739,
      "rewards/cosine_scaled_reward": -0.10389620521164034,
      "rewards/format_reward": 0.0,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2372.65283203125,
      "epoch": 0.19708654670094258,
      "grad_norm": 0.21576084196567535,
      "kl": 0.007110595703125,
      "learning_rate": 9.55824636882301e-07,
      "loss": 0.1072,
      "reward": 0.03794890362769365,
      "reward_std": 0.6275844648480415,
      "rewards/cosine_scaled_reward": 0.018974455073475838,
      "rewards/format_reward": 0.0,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3263.27783203125,
      "epoch": 0.19880034275921166,
      "grad_norm": 0.18672628700733185,
      "kl": 0.0029048919677734375,
      "learning_rate": 9.54457320834625e-07,
      "loss": -0.034,
      "reward": -0.3033560863696039,
      "reward_std": 0.5516846142709255,
      "rewards/cosine_scaled_reward": -0.15167804807424545,
      "rewards/format_reward": 0.0,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3024.2500610351562,
      "epoch": 0.20051413881748073,
      "grad_norm": 0.1308911144733429,
      "kl": 0.00705718994140625,
      "learning_rate": 9.530702921077358e-07,
      "loss": 0.0178,
      "reward": 0.19102132320404053,
      "reward_std": 0.7014489844441414,
      "rewards/cosine_scaled_reward": 0.09551066905260086,
      "rewards/format_reward": 0.0,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2574.013916015625,
      "epoch": 0.20222793487574978,
      "grad_norm": 0.325631320476532,
      "kl": 0.013393402099609375,
      "learning_rate": 9.516636183034564e-07,
      "loss": 0.0659,
      "reward": -0.29521266371011734,
      "reward_std": 0.5856474936008453,
      "rewards/cosine_scaled_reward": -0.1476063383743167,
      "rewards/format_reward": 0.0,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2724.3333740234375,
      "epoch": 0.20394173093401885,
      "grad_norm": 0.14827784895896912,
      "kl": 0.0027828216552734375,
      "learning_rate": 9.502373679810839e-07,
      "loss": 0.0141,
      "reward": -0.03255775198340416,
      "reward_std": 0.34701335430145264,
      "rewards/cosine_scaled_reward": -0.01627887785434723,
      "rewards/format_reward": 0.0,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2813.1111450195312,
      "epoch": 0.20565552699228792,
      "grad_norm": 0.21779808402061462,
      "kl": 0.0053081512451171875,
      "learning_rate": 9.487916106540465e-07,
      "loss": 0.0158,
      "reward": -0.19739244412630796,
      "reward_std": 0.6424184814095497,
      "rewards/cosine_scaled_reward": -0.0986962317256257,
      "rewards/format_reward": 0.0,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2874.0000610351562,
      "epoch": 0.207369323050557,
      "grad_norm": 0.2778118848800659,
      "kl": 0.0032444000244140625,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.0937,
      "reward": -0.15650038793683052,
      "reward_std": 0.5867400094866753,
      "rewards/cosine_scaled_reward": -0.07825020421296358,
      "rewards/format_reward": 0.0,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3251.1112060546875,
      "epoch": 0.20908311910882604,
      "grad_norm": 0.12883791327476501,
      "kl": 0.003650665283203125,
      "learning_rate": 9.458418577899774e-07,
      "loss": 0.02,
      "reward": -0.30216934718191624,
      "reward_std": 0.5233990028500557,
      "rewards/cosine_scaled_reward": -0.1510846719611436,
      "rewards/format_reward": 0.0,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2774.3194580078125,
      "epoch": 0.21079691516709512,
      "grad_norm": 0.20982016623020172,
      "kl": 0.002071380615234375,
      "learning_rate": 9.443380060197385e-07,
      "loss": 0.0498,
      "reward": 0.3517572022974491,
      "reward_std": 0.7633289247751236,
      "rewards/cosine_scaled_reward": 0.17587858624756336,
      "rewards/format_reward": 0.0,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3077.4166259765625,
      "epoch": 0.2125107112253642,
      "grad_norm": 0.14578428864479065,
      "kl": 0.0024623870849609375,
      "learning_rate": 9.428149347714143e-07,
      "loss": 0.002,
      "reward": -0.09189963340759277,
      "reward_std": 0.4004024267196655,
      "rewards/cosine_scaled_reward": -0.04594981297850609,
      "rewards/format_reward": 0.0,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2898.5556030273438,
      "epoch": 0.21422450728363324,
      "grad_norm": 0.19108974933624268,
      "kl": 0.0016632080078125,
      "learning_rate": 9.412727182773486e-07,
      "loss": 0.0218,
      "reward": 0.01400849362835288,
      "reward_std": 0.5958191454410553,
      "rewards/cosine_scaled_reward": 0.007004249142482877,
      "rewards/format_reward": 0.0,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3005.916748046875,
      "epoch": 0.2159383033419023,
      "grad_norm": 0.26980966329574585,
      "kl": 0.004871368408203125,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.0539,
      "reward": -0.19987820833921432,
      "reward_std": 0.5232749357819557,
      "rewards/cosine_scaled_reward": -0.09993909671902657,
      "rewards/format_reward": 0.0,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2932.5416870117188,
      "epoch": 0.21765209940017138,
      "grad_norm": 0.15654343366622925,
      "kl": 0.0043792724609375,
      "learning_rate": 9.381311511432658e-07,
      "loss": 0.0405,
      "reward": -0.17467445600777864,
      "reward_std": 0.5738040953874588,
      "rewards/cosine_scaled_reward": -0.0873372326605022,
      "rewards/format_reward": 0.0,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3097.6666870117188,
      "epoch": 0.21936589545844046,
      "grad_norm": 0.16381874680519104,
      "kl": 0.00551605224609375,
      "learning_rate": 9.36531953618799e-07,
      "loss": -0.0288,
      "reward": -0.20874720811843872,
      "reward_std": 0.5535652860999107,
      "rewards/cosine_scaled_reward": -0.10437360778450966,
      "rewards/format_reward": 0.0,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2531.861114501953,
      "epoch": 0.2210796915167095,
      "grad_norm": 0.26021480560302734,
      "kl": 0.006000518798828125,
      "learning_rate": 9.34913917072228e-07,
      "loss": 0.0459,
      "reward": -0.044261377304792404,
      "reward_std": 0.4739195331931114,
      "rewards/cosine_scaled_reward": -0.022130683064460754,
      "rewards/format_reward": 0.0,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2091.138916015625,
      "epoch": 0.22279348757497858,
      "grad_norm": 0.22645580768585205,
      "kl": 0.00482940673828125,
      "learning_rate": 9.332771203643714e-07,
      "loss": -0.0704,
      "reward": 0.38943320140242577,
      "reward_std": 0.7351026237010956,
      "rewards/cosine_scaled_reward": 0.19471661932766438,
      "rewards/format_reward": 0.0,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3235.513916015625,
      "epoch": 0.22450728363324765,
      "grad_norm": 0.14915120601654053,
      "kl": 0.003574371337890625,
      "learning_rate": 9.316216432703916e-07,
      "loss": 0.0073,
      "reward": -0.32377296313643456,
      "reward_std": 0.48132046312093735,
      "rewards/cosine_scaled_reward": -0.16188647784292698,
      "rewards/format_reward": 0.0,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3044.4166870117188,
      "epoch": 0.2262210796915167,
      "grad_norm": 0.16817504167556763,
      "kl": 0.003704071044921875,
      "learning_rate": 9.299475664759068e-07,
      "loss": 0.0174,
      "reward": -0.18535634828731418,
      "reward_std": 0.6574838161468506,
      "rewards/cosine_scaled_reward": -0.0926781720481813,
      "rewards/format_reward": 0.0,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3003.4444580078125,
      "epoch": 0.22793487574978577,
      "grad_norm": 0.15358978509902954,
      "kl": 0.0041980743408203125,
      "learning_rate": 9.282549715730579e-07,
      "loss": 0.0337,
      "reward": -0.05171632254496217,
      "reward_std": 0.5909973978996277,
      "rewards/cosine_scaled_reward": -0.025858158012852073,
      "rewards/format_reward": 0.0,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2156.4583740234375,
      "epoch": 0.22964867180805484,
      "grad_norm": 0.1683642566204071,
      "kl": 0.00467681884765625,
      "learning_rate": 9.265439410565328e-07,
      "loss": 0.0033,
      "reward": 0.009663693606853485,
      "reward_std": 0.4995303153991699,
      "rewards/cosine_scaled_reward": 0.004831850528717041,
      "rewards/format_reward": 0.0,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2540.2777709960938,
      "epoch": 0.23136246786632392,
      "grad_norm": 0.1953487992286682,
      "kl": 0.00795745849609375,
      "learning_rate": 9.248145583195447e-07,
      "loss": 0.0474,
      "reward": -0.21851413743570447,
      "reward_std": 0.5443524122238159,
      "rewards/cosine_scaled_reward": -0.10925705661065876,
      "rewards/format_reward": 0.0,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2695.0694885253906,
      "epoch": 0.23307626392459296,
      "grad_norm": 0.1705743372440338,
      "kl": 0.0045166015625,
      "learning_rate": 9.230669076497687e-07,
      "loss": 0.0191,
      "reward": 0.05242172256112099,
      "reward_std": 0.5593772605061531,
      "rewards/cosine_scaled_reward": 0.026210861280560493,
      "rewards/format_reward": 0.0,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3158.9444580078125,
      "epoch": 0.23479005998286204,
      "grad_norm": 0.17036336660385132,
      "kl": 0.00504302978515625,
      "learning_rate": 9.213010742252327e-07,
      "loss": 0.0254,
      "reward": 0.028430916368961334,
      "reward_std": 0.7066435366868973,
      "rewards/cosine_scaled_reward": 0.014215447008609772,
      "rewards/format_reward": 0.0,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3004.0416259765625,
      "epoch": 0.2365038560411311,
      "grad_norm": 0.1331450194120407,
      "kl": 0.003459930419921875,
      "learning_rate": 9.195171441101668e-07,
      "loss": -0.0176,
      "reward": -0.014733657240867615,
      "reward_std": 0.5561396405100822,
      "rewards/cosine_scaled_reward": -0.007366828620433807,
      "rewards/format_reward": 0.0,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2905.4444580078125,
      "epoch": 0.23821765209940018,
      "grad_norm": 0.17066888511180878,
      "kl": 0.00594329833984375,
      "learning_rate": 9.177152042508077e-07,
      "loss": 0.0097,
      "reward": -0.19389863312244415,
      "reward_std": 0.47480132430791855,
      "rewards/cosine_scaled_reward": -0.09694933146238327,
      "rewards/format_reward": 0.0,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2243.4722595214844,
      "epoch": 0.23993144815766923,
      "grad_norm": 0.2052508443593979,
      "kl": 0.0069751739501953125,
      "learning_rate": 9.158953424711624e-07,
      "loss": 0.0149,
      "reward": -0.17606773134320974,
      "reward_std": 0.4814153388142586,
      "rewards/cosine_scaled_reward": -0.08803386008366942,
      "rewards/format_reward": 0.0,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3140.3472290039062,
      "epoch": 0.2416452442159383,
      "grad_norm": 0.2093152105808258,
      "kl": 0.009227752685546875,
      "learning_rate": 9.140576474687263e-07,
      "loss": -0.0173,
      "reward": -0.2940823882818222,
      "reward_std": 0.46395206451416016,
      "rewards/cosine_scaled_reward": -0.1470412015914917,
      "rewards/format_reward": 0.0,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3106.0833740234375,
      "epoch": 0.24335904027420738,
      "grad_norm": 0.15536610782146454,
      "kl": 0.003704071044921875,
      "learning_rate": 9.122022088101613e-07,
      "loss": -0.0133,
      "reward": -0.12113199383020401,
      "reward_std": 0.5028039142489433,
      "rewards/cosine_scaled_reward": -0.06056600622832775,
      "rewards/format_reward": 0.0,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3045.0,
      "epoch": 0.24507283633247642,
      "grad_norm": 0.1554841846227646,
      "kl": 0.00397491455078125,
      "learning_rate": 9.103291169269299e-07,
      "loss": 0.0112,
      "reward": -0.24326159805059433,
      "reward_std": 0.545206792652607,
      "rewards/cosine_scaled_reward": -0.12163079530000687,
      "rewards/format_reward": 0.0,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3281.52783203125,
      "epoch": 0.2467866323907455,
      "grad_norm": 0.15741369128227234,
      "kl": 0.004161834716796875,
      "learning_rate": 9.084384631108882e-07,
      "loss": 0.0205,
      "reward": -0.3316431827843189,
      "reward_std": 0.5960408300161362,
      "rewards/cosine_scaled_reward": -0.16582159511744976,
      "rewards/format_reward": 0.0,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2848.000030517578,
      "epoch": 0.24850042844901457,
      "grad_norm": 0.16731388866901398,
      "kl": 0.004131317138671875,
      "learning_rate": 9.065303395098358e-07,
      "loss": 0.0076,
      "reward": -0.030747827142477036,
      "reward_std": 0.532738171517849,
      "rewards/cosine_scaled_reward": -0.015373910777270794,
      "rewards/format_reward": 0.0,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2467.6944580078125,
      "epoch": 0.25021422450728364,
      "grad_norm": 0.21354977786540985,
      "kl": 0.004360198974609375,
      "learning_rate": 9.046048391230247e-07,
      "loss": 0.0206,
      "reward": 0.0201254915446043,
      "reward_std": 0.8671004623174667,
      "rewards/cosine_scaled_reward": 0.010062748566269875,
      "rewards/format_reward": 0.0,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2855.6666870117188,
      "epoch": 0.2519280205655527,
      "grad_norm": 0.144964799284935,
      "kl": 0.004791259765625,
      "learning_rate": 9.026620557966279e-07,
      "loss": -0.0329,
      "reward": -0.2643125932663679,
      "reward_std": 0.5043439790606499,
      "rewards/cosine_scaled_reward": -0.13215629663318396,
      "rewards/format_reward": 0.0,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3096.513916015625,
      "epoch": 0.2536418166238218,
      "grad_norm": 0.14218087494373322,
      "kl": 0.003002166748046875,
      "learning_rate": 9.007020842191634e-07,
      "loss": -0.0089,
      "reward": -0.221635602414608,
      "reward_std": 0.4477159082889557,
      "rewards/cosine_scaled_reward": -0.11081778630614281,
      "rewards/format_reward": 0.0,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2720.263916015625,
      "epoch": 0.25535561268209084,
      "grad_norm": 0.17918290197849274,
      "kl": 0.004497528076171875,
      "learning_rate": 8.987250199168808e-07,
      "loss": -0.0669,
      "reward": -0.07472209073603153,
      "reward_std": 0.6641673818230629,
      "rewards/cosine_scaled_reward": -0.03736104257404804,
      "rewards/format_reward": 0.0,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2809.666717529297,
      "epoch": 0.2570694087403599,
      "grad_norm": 0.13525010645389557,
      "kl": 0.004337310791015625,
      "learning_rate": 8.967309592491052e-07,
      "loss": 0.0411,
      "reward": 0.32146409433335066,
      "reward_std": 0.6463728100061417,
      "rewards/cosine_scaled_reward": 0.16073204344138503,
      "rewards/format_reward": 0.0,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2688.6944580078125,
      "epoch": 0.258783204798629,
      "grad_norm": 0.29565665125846863,
      "kl": 0.0060577392578125,
      "learning_rate": 8.9471999940354e-07,
      "loss": -0.0998,
      "reward": -0.27624649833887815,
      "reward_std": 0.46585455536842346,
      "rewards/cosine_scaled_reward": -0.13812324171885848,
      "rewards/format_reward": 0.0,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2944.888916015625,
      "epoch": 0.26049700085689803,
      "grad_norm": 0.15996259450912476,
      "kl": 0.005001068115234375,
      "learning_rate": 8.926922383915315e-07,
      "loss": 0.0467,
      "reward": -0.09553277865052223,
      "reward_std": 0.5195184722542763,
      "rewards/cosine_scaled_reward": -0.047766391187906265,
      "rewards/format_reward": 0.0,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2880.5833740234375,
      "epoch": 0.2622107969151671,
      "grad_norm": 0.16603334248065948,
      "kl": 0.003936767578125,
      "learning_rate": 8.906477750432903e-07,
      "loss": 0.0105,
      "reward": -0.19235826842486858,
      "reward_std": 0.5736033394932747,
      "rewards/cosine_scaled_reward": -0.09617912326939404,
      "rewards/format_reward": 0.0,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2859.0972900390625,
      "epoch": 0.2639245929734362,
      "grad_norm": 0.17567692697048187,
      "kl": 0.004161834716796875,
      "learning_rate": 8.88586709003076e-07,
      "loss": -0.0056,
      "reward": -0.19033684581518173,
      "reward_std": 0.5773953720927238,
      "rewards/cosine_scaled_reward": -0.09516842663288116,
      "rewards/format_reward": 0.0,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3215.1666870117188,
      "epoch": 0.2656383890317052,
      "grad_norm": 0.14003609120845795,
      "kl": 0.004474639892578125,
      "learning_rate": 8.865091407243394e-07,
      "loss": 0.0216,
      "reward": -0.1411176547408104,
      "reward_std": 0.6216752380132675,
      "rewards/cosine_scaled_reward": -0.07055883854627609,
      "rewards/format_reward": 0.0,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2929.1250610351562,
      "epoch": 0.26735218508997427,
      "grad_norm": 0.14357882738113403,
      "kl": 0.003513336181640625,
      "learning_rate": 8.844151714648274e-07,
      "loss": -0.0355,
      "reward": -0.26859963312745094,
      "reward_std": 0.501942828297615,
      "rewards/cosine_scaled_reward": -0.13429982028901577,
      "rewards/format_reward": 0.0,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2811.6666870117188,
      "epoch": 0.26906598114824337,
      "grad_norm": 0.18389619886875153,
      "kl": 0.006900787353515625,
      "learning_rate": 8.823049032816478e-07,
      "loss": -0.049,
      "reward": 0.005984093062579632,
      "reward_std": 0.7341288924217224,
      "rewards/cosine_scaled_reward": 0.0029920428059995174,
      "rewards/format_reward": 0.0,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2909.77783203125,
      "epoch": 0.2707797772065124,
      "grad_norm": 0.13957001268863678,
      "kl": 0.0042877197265625,
      "learning_rate": 8.801784390262943e-07,
      "loss": 0.0033,
      "reward": -0.17342954874038696,
      "reward_std": 0.4903194531798363,
      "rewards/cosine_scaled_reward": -0.08671476691961288,
      "rewards/format_reward": 0.0,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3235.013916015625,
      "epoch": 0.27249357326478146,
      "grad_norm": 0.15003739297389984,
      "kl": 0.005523681640625,
      "learning_rate": 8.780358823396352e-07,
      "loss": 0.0068,
      "reward": -0.3410843312740326,
      "reward_std": 0.502905935049057,
      "rewards/cosine_scaled_reward": -0.17054216749966145,
      "rewards/format_reward": 0.0,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3160.6805419921875,
      "epoch": 0.27420736932305056,
      "grad_norm": 0.1586807668209076,
      "kl": 0.00443267822265625,
      "learning_rate": 8.758773376468604e-07,
      "loss": 0.0141,
      "reward": 0.04759278893470764,
      "reward_std": 0.6465433575212955,
      "rewards/cosine_scaled_reward": 0.02379640005528927,
      "rewards/format_reward": 0.0,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2961.3333129882812,
      "epoch": 0.2759211653813196,
      "grad_norm": 0.18396639823913574,
      "kl": 0.0078125,
      "learning_rate": 8.737029101523929e-07,
      "loss": 0.0282,
      "reward": -0.32911188155412674,
      "reward_std": 0.6032818555831909,
      "rewards/cosine_scaled_reward": -0.16455595009028912,
      "rewards/format_reward": 0.0,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2621.2222290039062,
      "epoch": 0.2776349614395887,
      "grad_norm": 0.15461236238479614,
      "kl": 0.00507354736328125,
      "learning_rate": 8.715127058347614e-07,
      "loss": -0.0194,
      "reward": -0.4356637103483081,
      "reward_std": 0.36323027312755585,
      "rewards/cosine_scaled_reward": -0.21783185191452503,
      "rewards/format_reward": 0.0,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3038.4027709960938,
      "epoch": 0.27934875749785776,
      "grad_norm": 0.12717723846435547,
      "kl": 0.005706787109375,
      "learning_rate": 8.693068314414344e-07,
      "loss": 0.0023,
      "reward": -0.04007915942929685,
      "reward_std": 0.6919823586940765,
      "rewards/cosine_scaled_reward": -0.02003958181012422,
      "rewards/format_reward": 0.0,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2678.7361755371094,
      "epoch": 0.2810625535561268,
      "grad_norm": 0.19941791892051697,
      "kl": 0.005207061767578125,
      "learning_rate": 8.670853944836176e-07,
      "loss": 0.0441,
      "reward": -0.040945328772068024,
      "reward_std": 0.5933430567383766,
      "rewards/cosine_scaled_reward": -0.020472656935453415,
      "rewards/format_reward": 0.0,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2895.0416870117188,
      "epoch": 0.2827763496143959,
      "grad_norm": 0.16098277270793915,
      "kl": 0.0064697265625,
      "learning_rate": 8.648485032310144e-07,
      "loss": 0.0293,
      "reward": -0.09013996832072735,
      "reward_std": 0.5875271111726761,
      "rewards/cosine_scaled_reward": -0.04506997298449278,
      "rewards/format_reward": 0.0,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2754.263916015625,
      "epoch": 0.28449014567266495,
      "grad_norm": 0.15243615210056305,
      "kl": 0.00676727294921875,
      "learning_rate": 8.625962667065487e-07,
      "loss": 0.0191,
      "reward": -0.0630449466407299,
      "reward_std": 0.6104780063033104,
      "rewards/cosine_scaled_reward": -0.0315224789083004,
      "rewards/format_reward": 0.0,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3024.9166870117188,
      "epoch": 0.286203941730934,
      "grad_norm": 0.14153960347175598,
      "kl": 0.0053863525390625,
      "learning_rate": 8.603287946810513e-07,
      "loss": 0.0428,
      "reward": -0.1417745603248477,
      "reward_std": 0.7242364957928658,
      "rewards/cosine_scaled_reward": -0.07088728016242385,
      "rewards/format_reward": 0.0,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3074.4306030273438,
      "epoch": 0.2879177377892031,
      "grad_norm": 0.1459978222846985,
      "kl": 0.0064067840576171875,
      "learning_rate": 8.580461976679099e-07,
      "loss": 0.0112,
      "reward": -0.01038459874689579,
      "reward_std": 0.7124739363789558,
      "rewards/cosine_scaled_reward": -0.0051923105493187904,
      "rewards/format_reward": 0.0,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3006.638916015625,
      "epoch": 0.28963153384747214,
      "grad_norm": 0.2151106894016266,
      "kl": 0.00925445556640625,
      "learning_rate": 8.557485869176825e-07,
      "loss": 0.0553,
      "reward": -0.2934446856379509,
      "reward_std": 0.5195991396903992,
      "rewards/cosine_scaled_reward": -0.14672234281897545,
      "rewards/format_reward": 0.0,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2720.8194274902344,
      "epoch": 0.2913453299057412,
      "grad_norm": 0.16352801024913788,
      "kl": 0.00519561767578125,
      "learning_rate": 8.534360744126753e-07,
      "loss": 0.061,
      "reward": 0.10783382831141353,
      "reward_std": 0.6230225935578346,
      "rewards/cosine_scaled_reward": 0.053916911128908396,
      "rewards/format_reward": 0.0,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2681.388946533203,
      "epoch": 0.2930591259640103,
      "grad_norm": 0.17118766903877258,
      "kl": 0.00641632080078125,
      "learning_rate": 8.511087728614862e-07,
      "loss": 0.026,
      "reward": -0.0785403607878834,
      "reward_std": 0.5736416950821877,
      "rewards/cosine_scaled_reward": -0.039270187029615045,
      "rewards/format_reward": 0.0,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2987.2361450195312,
      "epoch": 0.29477292202227934,
      "grad_norm": 0.15370719134807587,
      "kl": 0.0035552978515625,
      "learning_rate": 8.487667956935087e-07,
      "loss": -0.0033,
      "reward": -0.02974682953208685,
      "reward_std": 0.5253070890903473,
      "rewards/cosine_scaled_reward": -0.014873407315462828,
      "rewards/format_reward": 0.0,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2586.4027709960938,
      "epoch": 0.29648671808054844,
      "grad_norm": 0.22191597521305084,
      "kl": 0.0059356689453125,
      "learning_rate": 8.464102570534061e-07,
      "loss": -0.0092,
      "reward": -0.2831332399509847,
      "reward_std": 0.5445848181843758,
      "rewards/cosine_scaled_reward": -0.14156663417816162,
      "rewards/format_reward": 0.0,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2655.2222900390625,
      "epoch": 0.2982005141388175,
      "grad_norm": 0.22858025133609772,
      "kl": 0.01116943359375,
      "learning_rate": 8.440392717955475e-07,
      "loss": -0.0181,
      "reward": -0.2866486459970474,
      "reward_std": 0.5677091330289841,
      "rewards/cosine_scaled_reward": -0.14332432113587856,
      "rewards/format_reward": 0.0,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2976.8194580078125,
      "epoch": 0.29991431019708653,
      "grad_norm": 0.15686574578285217,
      "kl": 0.00734710693359375,
      "learning_rate": 8.416539554784089e-07,
      "loss": 0.0308,
      "reward": -0.3254437707364559,
      "reward_std": 0.5169026479125023,
      "rewards/cosine_scaled_reward": -0.16272189188748598,
      "rewards/format_reward": 0.0,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2500.736114501953,
      "epoch": 0.30162810625535563,
      "grad_norm": 0.2628232538700104,
      "kl": 0.03589630126953125,
      "learning_rate": 8.392544243589427e-07,
      "loss": 0.0534,
      "reward": -0.1589430421590805,
      "reward_std": 0.6641415655612946,
      "rewards/cosine_scaled_reward": -0.07947152107954025,
      "rewards/format_reward": 0.0,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3302.7361450195312,
      "epoch": 0.3033419023136247,
      "grad_norm": 0.13509048521518707,
      "kl": 0.003936767578125,
      "learning_rate": 8.368407953869103e-07,
      "loss": -0.0077,
      "reward": -0.3392331041395664,
      "reward_std": 0.44542837142944336,
      "rewards/cosine_scaled_reward": -0.1696165525354445,
      "rewards/format_reward": 0.0,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2946.9306030273438,
      "epoch": 0.3050556983718937,
      "grad_norm": 0.14318227767944336,
      "kl": 0.004425048828125,
      "learning_rate": 8.344131861991828e-07,
      "loss": 0.0129,
      "reward": 0.040801383554935455,
      "reward_std": 0.47273271530866623,
      "rewards/cosine_scaled_reward": 0.02040068805217743,
      "rewards/format_reward": 0.0,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3092.125,
      "epoch": 0.3067694944301628,
      "grad_norm": 0.15563301742076874,
      "kl": 0.010219573974609375,
      "learning_rate": 8.319717151140072e-07,
      "loss": 0.045,
      "reward": -0.1892098607495427,
      "reward_std": 0.5936430767178535,
      "rewards/cosine_scaled_reward": -0.09460492385551333,
      "rewards/format_reward": 0.0,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1971.5138854980469,
      "epoch": 0.30848329048843187,
      "grad_norm": 0.19795306026935577,
      "kl": 0.00974273681640625,
      "learning_rate": 8.295165011252396e-07,
      "loss": -0.0138,
      "reward": -0.11939475126564503,
      "reward_std": 0.6153334528207779,
      "rewards/cosine_scaled_reward": -0.05969736957922578,
      "rewards/format_reward": 0.0,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3067.1945190429688,
      "epoch": 0.3101970865467009,
      "grad_norm": 0.15797466039657593,
      "kl": 0.005138397216796875,
      "learning_rate": 8.270476638965461e-07,
      "loss": -0.0212,
      "reward": 0.10869292449206114,
      "reward_std": 0.6324612945318222,
      "rewards/cosine_scaled_reward": 0.054346468299627304,
      "rewards/format_reward": 0.0,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3268.486083984375,
      "epoch": 0.31191088260497,
      "grad_norm": 0.15513566136360168,
      "kl": 0.006145477294921875,
      "learning_rate": 8.245653237555705e-07,
      "loss": 0.0633,
      "reward": -0.2609336208552122,
      "reward_std": 0.49053191393613815,
      "rewards/cosine_scaled_reward": -0.13046680949628353,
      "rewards/format_reward": 0.0,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2989.84716796875,
      "epoch": 0.31362467866323906,
      "grad_norm": 0.15209534764289856,
      "kl": 0.00447845458984375,
      "learning_rate": 8.220696016880687e-07,
      "loss": 0.0061,
      "reward": -0.040327644906938076,
      "reward_std": 0.717703215777874,
      "rewards/cosine_scaled_reward": -0.02016383269801736,
      "rewards/format_reward": 0.0,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2890.4583740234375,
      "epoch": 0.31533847472150817,
      "grad_norm": 0.1359020322561264,
      "kl": 0.006763458251953125,
      "learning_rate": 8.195606193320136e-07,
      "loss": 0.0369,
      "reward": -0.22703023999929428,
      "reward_std": 0.6005472913384438,
      "rewards/cosine_scaled_reward": -0.11351512093096972,
      "rewards/format_reward": 0.0,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2926.3194580078125,
      "epoch": 0.3170522707797772,
      "grad_norm": 0.13524238765239716,
      "kl": 0.0067901611328125,
      "learning_rate": 8.170384989716657e-07,
      "loss": 0.0495,
      "reward": -0.17516471818089485,
      "reward_std": 0.5499648228287697,
      "rewards/cosine_scaled_reward": -0.08758235163986683,
      "rewards/format_reward": 0.0,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2758.0833740234375,
      "epoch": 0.31876606683804626,
      "grad_norm": 0.19634363055229187,
      "kl": 0.0046234130859375,
      "learning_rate": 8.145033635316128e-07,
      "loss": -0.0094,
      "reward": -0.17140711098909378,
      "reward_std": 0.5592127367854118,
      "rewards/cosine_scaled_reward": -0.08570355176925659,
      "rewards/format_reward": 0.0,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2399.749969482422,
      "epoch": 0.32047986289631536,
      "grad_norm": 0.14529581367969513,
      "kl": 0.00421905517578125,
      "learning_rate": 8.119553365707802e-07,
      "loss": 0.0233,
      "reward": 0.07888301834464073,
      "reward_std": 0.7940803468227386,
      "rewards/cosine_scaled_reward": 0.03944151382893324,
      "rewards/format_reward": 0.0,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3158.486083984375,
      "epoch": 0.3221936589545844,
      "grad_norm": 0.15482233464717865,
      "kl": 0.0067596435546875,
      "learning_rate": 8.093945422764069e-07,
      "loss": -0.0061,
      "reward": -0.22822286747395992,
      "reward_std": 0.48042069375514984,
      "rewards/cosine_scaled_reward": -0.1141114397905767,
      "rewards/format_reward": 0.0,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2875.52783203125,
      "epoch": 0.32390745501285345,
      "grad_norm": 0.16830354928970337,
      "kl": 0.00731658935546875,
      "learning_rate": 8.068211054579943e-07,
      "loss": -0.0214,
      "reward": -0.27129118889570236,
      "reward_std": 0.44227684289216995,
      "rewards/cosine_scaled_reward": -0.13564559258520603,
      "rewards/format_reward": 0.0,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3101.52783203125,
      "epoch": 0.32562125107112255,
      "grad_norm": 0.17314012348651886,
      "kl": 0.006618499755859375,
      "learning_rate": 8.04235151541222e-07,
      "loss": 0.0361,
      "reward": -0.29743205150589347,
      "reward_std": 0.6253781244158745,
      "rewards/cosine_scaled_reward": -0.1487160255201161,
      "rewards/format_reward": 0.0,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3005.0694580078125,
      "epoch": 0.3273350471293916,
      "grad_norm": 0.13242636620998383,
      "kl": 0.005157470703125,
      "learning_rate": 8.01636806561836e-07,
      "loss": 0.0398,
      "reward": -0.2783219777047634,
      "reward_std": 0.5744869485497475,
      "rewards/cosine_scaled_reward": -0.139160992577672,
      "rewards/format_reward": 0.0,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2682.2777709960938,
      "epoch": 0.32904884318766064,
      "grad_norm": 0.1771107167005539,
      "kl": 0.00849151611328125,
      "learning_rate": 7.990261971595048e-07,
      "loss": -0.0275,
      "reward": -0.16758478805422783,
      "reward_std": 0.5308270826935768,
      "rewards/cosine_scaled_reward": -0.08379239588975906,
      "rewards/format_reward": 0.0,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2224.2083587646484,
      "epoch": 0.33076263924592975,
      "grad_norm": 0.2606137990951538,
      "kl": 0.0111236572265625,
      "learning_rate": 7.964034505716476e-07,
      "loss": 0.0598,
      "reward": -0.1425977125763893,
      "reward_std": 0.6462048292160034,
      "rewards/cosine_scaled_reward": -0.0712988581508398,
      "rewards/format_reward": 0.0,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2366.65283203125,
      "epoch": 0.3324764353041988,
      "grad_norm": 0.20748130977153778,
      "kl": 0.00627899169921875,
      "learning_rate": 7.93768694627233e-07,
      "loss": 0.0302,
      "reward": 0.07216466031968594,
      "reward_std": 0.5604969188570976,
      "rewards/cosine_scaled_reward": 0.03608234319835901,
      "rewards/format_reward": 0.0,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3086.3889770507812,
      "epoch": 0.3341902313624679,
      "grad_norm": 0.16518257558345795,
      "kl": 0.007572174072265625,
      "learning_rate": 7.911220577405484e-07,
      "loss": 0.0403,
      "reward": -0.2750488445162773,
      "reward_std": 0.44911373406648636,
      "rewards/cosine_scaled_reward": -0.13752441480755806,
      "rewards/format_reward": 0.0,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2695.0416870117188,
      "epoch": 0.33590402742073694,
      "grad_norm": 0.14707054197788239,
      "kl": 0.0079803466796875,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.0299,
      "reward": 0.3252771459519863,
      "reward_std": 0.7292146235704422,
      "rewards/cosine_scaled_reward": 0.162638571113348,
      "rewards/format_reward": 0.0,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2711.7083129882812,
      "epoch": 0.337617823479006,
      "grad_norm": 0.19674766063690186,
      "kl": 0.00566864013671875,
      "learning_rate": 7.857936576865356e-07,
      "loss": 0.0125,
      "reward": 0.1904342882335186,
      "reward_std": 0.6823486983776093,
      "rewards/cosine_scaled_reward": 0.09521715994924307,
      "rewards/format_reward": 0.0,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2801.2361450195312,
      "epoch": 0.3393316195372751,
      "grad_norm": 0.17002622783184052,
      "kl": 0.00652313232421875,
      "learning_rate": 7.831121542179086e-07,
      "loss": 0.0551,
      "reward": -0.1881256103515625,
      "reward_std": 0.41709040850400925,
      "rewards/cosine_scaled_reward": -0.0940628070384264,
      "rewards/format_reward": 0.0,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2945.0139770507812,
      "epoch": 0.34104541559554413,
      "grad_norm": 0.17246587574481964,
      "kl": 0.006256103515625,
      "learning_rate": 7.804192891917571e-07,
      "loss": -0.0014,
      "reward": -0.20545833744108677,
      "reward_std": 0.5765868201851845,
      "rewards/cosine_scaled_reward": -0.10272916965186596,
      "rewards/format_reward": 0.0,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2718.9583435058594,
      "epoch": 0.3427592116538132,
      "grad_norm": 0.184196338057518,
      "kl": 0.008544921875,
      "learning_rate": 7.777151938545235e-07,
      "loss": 0.016,
      "reward": -0.036401793360710144,
      "reward_std": 0.7076919972896576,
      "rewards/cosine_scaled_reward": -0.01820090040564537,
      "rewards/format_reward": 0.0,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2624.166717529297,
      "epoch": 0.3444730077120823,
      "grad_norm": 0.2025025188922882,
      "kl": 0.00604248046875,
      "learning_rate": 7.75e-07,
      "loss": -0.0684,
      "reward": -0.23150286450982094,
      "reward_std": 0.4834456667304039,
      "rewards/cosine_scaled_reward": -0.11575142852962017,
      "rewards/format_reward": 0.0,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2488.5556030273438,
      "epoch": 0.3461868037703513,
      "grad_norm": 0.15225747227668762,
      "kl": 0.005893707275390625,
      "learning_rate": 7.72273839962904e-07,
      "loss": -0.0317,
      "reward": 0.06343521224334836,
      "reward_std": 0.6216820403933525,
      "rewards/cosine_scaled_reward": 0.03171759960241616,
      "rewards/format_reward": 0.0,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2638.0556030273438,
      "epoch": 0.34790059982862037,
      "grad_norm": 0.19878201186656952,
      "kl": 0.00801849365234375,
      "learning_rate": 7.695368466124296e-07,
      "loss": 0.0177,
      "reward": 0.24296507984399796,
      "reward_std": 0.7006724625825882,
      "rewards/cosine_scaled_reward": 0.12148253805935383,
      "rewards/format_reward": 0.0,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2701.2499389648438,
      "epoch": 0.3496143958868895,
      "grad_norm": 0.16115106642246246,
      "kl": 0.005344390869140625,
      "learning_rate": 7.667891533457718e-07,
      "loss": 0.0175,
      "reward": -0.01583041623234749,
      "reward_std": 0.5048926845192909,
      "rewards/cosine_scaled_reward": -0.007915209047496319,
      "rewards/format_reward": 0.0,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2820.15283203125,
      "epoch": 0.3513281919451585,
      "grad_norm": 0.1620146483182907,
      "kl": 0.00921630859375,
      "learning_rate": 7.640308940816239e-07,
      "loss": 0.0106,
      "reward": 0.10508427396416664,
      "reward_std": 0.5011924579739571,
      "rewards/cosine_scaled_reward": 0.05254213139414787,
      "rewards/format_reward": 0.0,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2766.77783203125,
      "epoch": 0.35304198800342756,
      "grad_norm": 0.17725811898708344,
      "kl": 0.0068359375,
      "learning_rate": 7.612622032536507e-07,
      "loss": 0.0292,
      "reward": -0.025651058182120323,
      "reward_std": 0.6831357106566429,
      "rewards/cosine_scaled_reward": -0.012825531885027885,
      "rewards/format_reward": 0.0,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2571.3333740234375,
      "epoch": 0.35475578406169667,
      "grad_norm": 0.2153560221195221,
      "kl": 0.00772857666015625,
      "learning_rate": 7.584832158039378e-07,
      "loss": -0.0053,
      "reward": -0.0772455558180809,
      "reward_std": 0.5703203156590462,
      "rewards/cosine_scaled_reward": -0.038622772321105,
      "rewards/format_reward": 0.0,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2722.9444580078125,
      "epoch": 0.3564695801199657,
      "grad_norm": 0.2059468924999237,
      "kl": 0.00662994384765625,
      "learning_rate": 7.556940671764124e-07,
      "loss": 0.0612,
      "reward": -0.18379988404922187,
      "reward_std": 0.6482012867927551,
      "rewards/cosine_scaled_reward": -0.09189994307234883,
      "rewards/format_reward": 0.0,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2714.9445190429688,
      "epoch": 0.3581833761782348,
      "grad_norm": 0.1764851063489914,
      "kl": 0.00818634033203125,
      "learning_rate": 7.528948933102438e-07,
      "loss": 0.0477,
      "reward": -0.011997078021522611,
      "reward_std": 0.6311939656734467,
      "rewards/cosine_scaled_reward": -0.005998534747050144,
      "rewards/format_reward": 0.0,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2458.9583740234375,
      "epoch": 0.35989717223650386,
      "grad_norm": 0.23969826102256775,
      "kl": 0.00959014892578125,
      "learning_rate": 7.500858306332172e-07,
      "loss": -0.0174,
      "reward": -0.052909690886735916,
      "reward_std": 0.6342033296823502,
      "rewards/cosine_scaled_reward": -0.026454854756593704,
      "rewards/format_reward": 0.0,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3275.9722290039062,
      "epoch": 0.3616109682947729,
      "grad_norm": 0.14406003057956696,
      "kl": 0.0072784423828125,
      "learning_rate": 7.472670160550848e-07,
      "loss": -0.0075,
      "reward": -0.4154173508286476,
      "reward_std": 0.47341830283403397,
      "rewards/cosine_scaled_reward": -0.2077086754143238,
      "rewards/format_reward": 0.0,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2626.1805419921875,
      "epoch": 0.363324764353042,
      "grad_norm": 0.1677497923374176,
      "kl": 0.007781982421875,
      "learning_rate": 7.444385869608921e-07,
      "loss": -0.0218,
      "reward": -0.05068176053464413,
      "reward_std": 0.5218113884329796,
      "rewards/cosine_scaled_reward": -0.025340883061289787,
      "rewards/format_reward": 0.0,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2894.8472290039062,
      "epoch": 0.36503856041131105,
      "grad_norm": 0.17942826449871063,
      "kl": 0.00614166259765625,
      "learning_rate": 7.416006812042827e-07,
      "loss": 0.0757,
      "reward": -0.09456230141222477,
      "reward_std": 0.6797711104154587,
      "rewards/cosine_scaled_reward": -0.0472811465151608,
      "rewards/format_reward": 0.0,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2945.8750610351562,
      "epoch": 0.3667523564695801,
      "grad_norm": 0.1967238038778305,
      "kl": 0.0100555419921875,
      "learning_rate": 7.387534371007797e-07,
      "loss": -0.0128,
      "reward": -0.10412277281284332,
      "reward_std": 0.7091450989246368,
      "rewards/cosine_scaled_reward": -0.05206138640642166,
      "rewards/format_reward": 0.0,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2597.4444580078125,
      "epoch": 0.3684661525278492,
      "grad_norm": 0.19232463836669922,
      "kl": 0.00603485107421875,
      "learning_rate": 7.358969934210438e-07,
      "loss": 0.0557,
      "reward": 0.07514850981533527,
      "reward_std": 0.5688696801662445,
      "rewards/cosine_scaled_reward": 0.03757425490766764,
      "rewards/format_reward": 0.0,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3071.6805419921875,
      "epoch": 0.37017994858611825,
      "grad_norm": 0.15334580838680267,
      "kl": 0.00914764404296875,
      "learning_rate": 7.330314893841101e-07,
      "loss": -0.0092,
      "reward": -0.3550204383209348,
      "reward_std": 0.36161456257104874,
      "rewards/cosine_scaled_reward": -0.1775102224200964,
      "rewards/format_reward": 0.0,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2696.5972595214844,
      "epoch": 0.3718937446443873,
      "grad_norm": 0.1864735186100006,
      "kl": 0.005496978759765625,
      "learning_rate": 7.301570646506027e-07,
      "loss": 0.0101,
      "reward": -0.07679219171404839,
      "reward_std": 0.6243979334831238,
      "rewards/cosine_scaled_reward": -0.03839609259739518,
      "rewards/format_reward": 0.0,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2563.0834045410156,
      "epoch": 0.3736075407026564,
      "grad_norm": 0.16931480169296265,
      "kl": 0.005645751953125,
      "learning_rate": 7.27273859315928e-07,
      "loss": -0.006,
      "reward": -0.06438015587627888,
      "reward_std": 0.4739932492375374,
      "rewards/cosine_scaled_reward": -0.032190063036978245,
      "rewards/format_reward": 0.0,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3087.9306030273438,
      "epoch": 0.37532133676092544,
      "grad_norm": 0.15486636757850647,
      "kl": 0.00830841064453125,
      "learning_rate": 7.243820139034464e-07,
      "loss": 0.034,
      "reward": -0.1913878731429577,
      "reward_std": 0.7374170869588852,
      "rewards/cosine_scaled_reward": -0.09569394029676914,
      "rewards/format_reward": 0.0,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3060.9445190429688,
      "epoch": 0.37703513281919454,
      "grad_norm": 0.1392551213502884,
      "kl": 0.00780487060546875,
      "learning_rate": 7.214816693576234e-07,
      "loss": -0.0219,
      "reward": -0.3524288460612297,
      "reward_std": 0.4711146801710129,
      "rewards/cosine_scaled_reward": -0.17621441558003426,
      "rewards/format_reward": 0.0,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3082.2083129882812,
      "epoch": 0.3787489288774636,
      "grad_norm": 0.15662223100662231,
      "kl": 0.006549835205078125,
      "learning_rate": 7.185729670371604e-07,
      "loss": 0.0127,
      "reward": -0.051485654432326555,
      "reward_std": 0.5929789990186691,
      "rewards/cosine_scaled_reward": -0.025742830068338662,
      "rewards/format_reward": 0.0,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2778.666717529297,
      "epoch": 0.38046272493573263,
      "grad_norm": 0.17736981809139252,
      "kl": 0.008647918701171875,
      "learning_rate": 7.156560487081051e-07,
      "loss": 0.0084,
      "reward": -0.13847951218485832,
      "reward_std": 0.5384139195084572,
      "rewards/cosine_scaled_reward": -0.06923975050449371,
      "rewards/format_reward": 0.0,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3187.9305419921875,
      "epoch": 0.38217652099400173,
      "grad_norm": 0.13862548768520355,
      "kl": 0.00606536865234375,
      "learning_rate": 7.127310565369415e-07,
      "loss": 0.0539,
      "reward": -0.3446214310824871,
      "reward_std": 0.46420831978321075,
      "rewards/cosine_scaled_reward": -0.1723107136785984,
      "rewards/format_reward": 0.0,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3035.513916015625,
      "epoch": 0.3838903170522708,
      "grad_norm": 0.17018531262874603,
      "kl": 0.0105438232421875,
      "learning_rate": 7.097981330836616e-07,
      "loss": 0.0259,
      "reward": -0.21396764740347862,
      "reward_std": 0.5872293263673782,
      "rewards/cosine_scaled_reward": -0.10698381997644901,
      "rewards/format_reward": 0.0,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2804.013916015625,
      "epoch": 0.3856041131105398,
      "grad_norm": 0.19500206410884857,
      "kl": 0.00768280029296875,
      "learning_rate": 7.068574212948169e-07,
      "loss": 0.1055,
      "reward": -0.22558368369936943,
      "reward_std": 0.6132937371730804,
      "rewards/cosine_scaled_reward": -0.11279183439910412,
      "rewards/format_reward": 0.0,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2667.0555725097656,
      "epoch": 0.3873179091688089,
      "grad_norm": 0.18770119547843933,
      "kl": 0.007568359375,
      "learning_rate": 7.039090644965509e-07,
      "loss": 0.0042,
      "reward": -0.05162630486302078,
      "reward_std": 0.6696203723549843,
      "rewards/cosine_scaled_reward": -0.025813143118284643,
      "rewards/format_reward": 0.0,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2906.6805419921875,
      "epoch": 0.389031705227078,
      "grad_norm": 0.16604122519493103,
      "kl": 0.0076141357421875,
      "learning_rate": 7.009532063876148e-07,
      "loss": 0.0146,
      "reward": -0.1345351382624358,
      "reward_std": 0.7545941472053528,
      "rewards/cosine_scaled_reward": -0.0672675691312179,
      "rewards/format_reward": 0.0,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2913.791748046875,
      "epoch": 0.390745501285347,
      "grad_norm": 0.36757490038871765,
      "kl": 0.006805419921875,
      "learning_rate": 6.979899910323624e-07,
      "loss": 0.0796,
      "reward": 0.061263229697942734,
      "reward_std": 0.5674895793199539,
      "rewards/cosine_scaled_reward": 0.030631612986326218,
      "rewards/format_reward": 0.0,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2708.4027709960938,
      "epoch": 0.3924592973436161,
      "grad_norm": 0.17164915800094604,
      "kl": 0.0065765380859375,
      "learning_rate": 6.950195628537299e-07,
      "loss": 0.061,
      "reward": -0.17019816813990474,
      "reward_std": 0.5833596885204315,
      "rewards/cosine_scaled_reward": -0.08509908034466207,
      "rewards/format_reward": 0.0,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2934.0555419921875,
      "epoch": 0.39417309340188517,
      "grad_norm": 0.16252191364765167,
      "kl": 0.0125732421875,
      "learning_rate": 6.920420666261961e-07,
      "loss": -0.0251,
      "reward": -0.27500685676932335,
      "reward_std": 0.4450754225254059,
      "rewards/cosine_scaled_reward": -0.13750343304127455,
      "rewards/format_reward": 0.0,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3062.4583740234375,
      "epoch": 0.39588688946015427,
      "grad_norm": 0.13106314837932587,
      "kl": 0.0096435546875,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.0074,
      "reward": -0.11593299638479948,
      "reward_std": 0.5865771174430847,
      "rewards/cosine_scaled_reward": -0.057966490276157856,
      "rewards/format_reward": 0.0,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2548.4444580078125,
      "epoch": 0.3976006855184233,
      "grad_norm": 0.15283794701099396,
      "kl": 0.00823974609375,
      "learning_rate": 6.860664508377001e-07,
      "loss": 0.0156,
      "reward": 0.012726329267024994,
      "reward_std": 0.6339813768863678,
      "rewards/cosine_scaled_reward": 0.006363175809383392,
      "rewards/format_reward": 0.0,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1752.4861602783203,
      "epoch": 0.39931448157669236,
      "grad_norm": 0.17673321068286896,
      "kl": 0.0053558349609375,
      "learning_rate": 6.83068622519821e-07,
      "loss": 0.0344,
      "reward": 0.3881940320134163,
      "reward_std": 0.6750105991959572,
      "rewards/cosine_scaled_reward": 0.19409702718257904,
      "rewards/format_reward": 0.0,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2785.6111450195312,
      "epoch": 0.40102827763496146,
      "grad_norm": 0.172316312789917,
      "kl": 0.01160430908203125,
      "learning_rate": 6.800643086250121e-07,
      "loss": -0.0154,
      "reward": -0.2950245440006256,
      "reward_std": 0.6799461841583252,
      "rewards/cosine_scaled_reward": -0.1475122720003128,
      "rewards/format_reward": 0.0,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3018.3056030273438,
      "epoch": 0.4027420736932305,
      "grad_norm": 0.15055446326732635,
      "kl": 0.0095977783203125,
      "learning_rate": 6.770536555792944e-07,
      "loss": -0.0457,
      "reward": -0.19169194623827934,
      "reward_std": 0.4096248298883438,
      "rewards/cosine_scaled_reward": -0.09584598150104284,
      "rewards/format_reward": 0.0,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3233.8889770507812,
      "epoch": 0.40445586975149955,
      "grad_norm": 0.14838387072086334,
      "kl": 0.009735107421875,
      "learning_rate": 6.740368101176495e-07,
      "loss": 0.0306,
      "reward": -0.14736445620656013,
      "reward_std": 0.6041549146175385,
      "rewards/cosine_scaled_reward": -0.07368221180513501,
      "rewards/format_reward": 0.0,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2462.7361755371094,
      "epoch": 0.40616966580976865,
      "grad_norm": 0.21186563372612,
      "kl": 0.0077667236328125,
      "learning_rate": 6.710139192768694e-07,
      "loss": 0.0142,
      "reward": -0.2296012807637453,
      "reward_std": 0.5129070654511452,
      "rewards/cosine_scaled_reward": -0.11480064131319523,
      "rewards/format_reward": 0.0,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3028.5277709960938,
      "epoch": 0.4078834618680377,
      "grad_norm": 0.15430369973182678,
      "kl": 0.00980377197265625,
      "learning_rate": 6.679851303883891e-07,
      "loss": 0.0326,
      "reward": -0.04171431064605713,
      "reward_std": 0.5160864554345608,
      "rewards/cosine_scaled_reward": -0.020857159048318863,
      "rewards/format_reward": 0.0,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2878.597198486328,
      "epoch": 0.40959725792630675,
      "grad_norm": 0.1511092185974121,
      "kl": 0.00661468505859375,
      "learning_rate": 6.649505910711058e-07,
      "loss": 0.0053,
      "reward": -0.08533445000648499,
      "reward_std": 0.48660216480493546,
      "rewards/cosine_scaled_reward": -0.04266723245382309,
      "rewards/format_reward": 0.0,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2145.986114501953,
      "epoch": 0.41131105398457585,
      "grad_norm": 0.19034837186336517,
      "kl": 0.00772857666015625,
      "learning_rate": 6.619104492241847e-07,
      "loss": 0.0412,
      "reward": 0.22470230411272496,
      "reward_std": 0.5070570334792137,
      "rewards/cosine_scaled_reward": 0.11235115380259231,
      "rewards/format_reward": 0.0,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2227.7361450195312,
      "epoch": 0.4130248500428449,
      "grad_norm": 0.1779133826494217,
      "kl": 0.008626937866210938,
      "learning_rate": 6.588648530198504e-07,
      "loss": 0.0419,
      "reward": -0.0513172447681427,
      "reward_std": 0.617318756878376,
      "rewards/cosine_scaled_reward": -0.025658607482910156,
      "rewards/format_reward": 0.0,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3336.3333740234375,
      "epoch": 0.414738646101114,
      "grad_norm": 0.1791980117559433,
      "kl": 0.00740814208984375,
      "learning_rate": 6.558139508961654e-07,
      "loss": -0.0064,
      "reward": -0.14741731621325016,
      "reward_std": 0.7067866027355194,
      "rewards/cosine_scaled_reward": -0.07370865810662508,
      "rewards/format_reward": 0.0,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3000.888916015625,
      "epoch": 0.41645244215938304,
      "grad_norm": 0.146419495344162,
      "kl": 0.006435394287109375,
      "learning_rate": 6.527578915497951e-07,
      "loss": 0.0311,
      "reward": -0.012151572853326797,
      "reward_std": 0.7768204510211945,
      "rewards/cosine_scaled_reward": -0.006075790151953697,
      "rewards/format_reward": 0.0,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2964.9444885253906,
      "epoch": 0.4181662382176521,
      "grad_norm": 0.1862625777721405,
      "kl": 0.00849151611328125,
      "learning_rate": 6.496968239287603e-07,
      "loss": 0.0372,
      "reward": -0.16059484332799911,
      "reward_std": 0.5683267489075661,
      "rewards/cosine_scaled_reward": -0.08029741793870926,
      "rewards/format_reward": 0.0,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1833.4166717529297,
      "epoch": 0.4198800342759212,
      "grad_norm": 0.3224428594112396,
      "kl": 0.0082550048828125,
      "learning_rate": 6.466308972251785e-07,
      "loss": -0.0459,
      "reward": 0.06598322093486786,
      "reward_std": 0.6559992954134941,
      "rewards/cosine_scaled_reward": 0.032991619780659676,
      "rewards/format_reward": 0.0,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2580.0694580078125,
      "epoch": 0.42159383033419023,
      "grad_norm": 0.1514631062746048,
      "kl": 0.01023101806640625,
      "learning_rate": 6.435602608679916e-07,
      "loss": 0.0527,
      "reward": -0.02805427461862564,
      "reward_std": 0.6845656186342239,
      "rewards/cosine_scaled_reward": -0.01402713917195797,
      "rewards/format_reward": 0.0,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3155.2500610351562,
      "epoch": 0.4233076263924593,
      "grad_norm": 0.1348743587732315,
      "kl": 0.01012420654296875,
      "learning_rate": 6.404850645156841e-07,
      "loss": 0.0538,
      "reward": -0.11579635553061962,
      "reward_std": 0.7224173843860626,
      "rewards/cosine_scaled_reward": -0.057898176833987236,
      "rewards/format_reward": 0.0,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2719.7222290039062,
      "epoch": 0.4250214224507284,
      "grad_norm": 0.16808690130710602,
      "kl": 0.010040283203125,
      "learning_rate": 6.374054580489873e-07,
      "loss": -0.0027,
      "reward": -0.1423700600862503,
      "reward_std": 0.41877883672714233,
      "rewards/cosine_scaled_reward": -0.07118503004312515,
      "rewards/format_reward": 0.0,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3017.5416870117188,
      "epoch": 0.4267352185089974,
      "grad_norm": 0.13636116683483124,
      "kl": 0.01111602783203125,
      "learning_rate": 6.343215915635761e-07,
      "loss": 0.0335,
      "reward": -0.16177499457262456,
      "reward_std": 0.41518206894397736,
      "rewards/cosine_scaled_reward": -0.08088749897433445,
      "rewards/format_reward": 0.0,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2664.6666870117188,
      "epoch": 0.4284490145672665,
      "grad_norm": 0.1738223433494568,
      "kl": 0.00909423828125,
      "learning_rate": 6.31233615362752e-07,
      "loss": 0.0581,
      "reward": -0.14353771694004536,
      "reward_std": 0.5664958357810974,
      "rewards/cosine_scaled_reward": -0.07176885847002268,
      "rewards/format_reward": 0.0,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2695.2916870117188,
      "epoch": 0.4301628106255356,
      "grad_norm": 0.16924519836902618,
      "kl": 0.009357452392578125,
      "learning_rate": 6.281416799501187e-07,
      "loss": -0.0019,
      "reward": 0.09459428116679192,
      "reward_std": 0.6146803349256516,
      "rewards/cosine_scaled_reward": 0.04729713872075081,
      "rewards/format_reward": 0.0,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2714.5416259765625,
      "epoch": 0.4318766066838046,
      "grad_norm": 0.19001764059066772,
      "kl": 0.0080413818359375,
      "learning_rate": 6.25045936022246e-07,
      "loss": -0.0049,
      "reward": -0.00815525185316801,
      "reward_std": 0.5676329433917999,
      "rewards/cosine_scaled_reward": -0.00407763384282589,
      "rewards/format_reward": 0.0,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2847.4166870117188,
      "epoch": 0.43359040274207367,
      "grad_norm": 0.1463892161846161,
      "kl": 0.01003265380859375,
      "learning_rate": 6.219465344613258e-07,
      "loss": -0.0337,
      "reward": 0.009062569588422775,
      "reward_std": 0.5907448679208755,
      "rewards/cosine_scaled_reward": 0.0045312922447919846,
      "rewards/format_reward": 0.0,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2974.9166870117188,
      "epoch": 0.43530419880034277,
      "grad_norm": 0.15965710580348969,
      "kl": 0.009674072265625,
      "learning_rate": 6.188436263278172e-07,
      "loss": 0.0032,
      "reward": 0.09970302879810333,
      "reward_std": 0.4728682413697243,
      "rewards/cosine_scaled_reward": 0.04985151067376137,
      "rewards/format_reward": 0.0,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2707.4027709960938,
      "epoch": 0.4370179948586118,
      "grad_norm": 0.1550796627998352,
      "kl": 0.01021575927734375,
      "learning_rate": 6.157373628530852e-07,
      "loss": 0.0007,
      "reward": -0.08888162672519684,
      "reward_std": 0.4977044016122818,
      "rewards/cosine_scaled_reward": -0.04444081336259842,
      "rewards/format_reward": 0.0,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2744.763916015625,
      "epoch": 0.4387317909168809,
      "grad_norm": 0.21653364598751068,
      "kl": 0.009979248046875,
      "learning_rate": 6.126278954320294e-07,
      "loss": 0.0062,
      "reward": -0.24449253268539906,
      "reward_std": 0.4354872331023216,
      "rewards/cosine_scaled_reward": -0.12224626448005438,
      "rewards/format_reward": 0.0,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2490.9722290039062,
      "epoch": 0.44044558697514996,
      "grad_norm": 0.1892397254705429,
      "kl": 0.00962066650390625,
      "learning_rate": 6.095153756157051e-07,
      "loss": 0.0269,
      "reward": 0.1365387246478349,
      "reward_std": 0.6730539947748184,
      "rewards/cosine_scaled_reward": 0.06826936185825616,
      "rewards/format_reward": 0.0,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2796.7916870117188,
      "epoch": 0.442159383033419,
      "grad_norm": 0.16943146288394928,
      "kl": 0.0094757080078125,
      "learning_rate": 6.06399955103937e-07,
      "loss": -0.0094,
      "reward": -0.27603928185999393,
      "reward_std": 0.517802283167839,
      "rewards/cosine_scaled_reward": -0.13801964186131954,
      "rewards/format_reward": 0.0,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2211.4305419921875,
      "epoch": 0.4438731790916881,
      "grad_norm": 0.23119370639324188,
      "kl": 0.0067596435546875,
      "learning_rate": 6.032817857379256e-07,
      "loss": 0.1106,
      "reward": -0.10240336135029793,
      "reward_std": 0.5084675773978233,
      "rewards/cosine_scaled_reward": -0.05120168812572956,
      "rewards/format_reward": 0.0,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2785.3055419921875,
      "epoch": 0.44558697514995715,
      "grad_norm": 0.21458233892917633,
      "kl": 0.00876617431640625,
      "learning_rate": 6.001610194928464e-07,
      "loss": -0.0013,
      "reward": 0.051987094804644585,
      "reward_std": 0.5341488644480705,
      "rewards/cosine_scaled_reward": 0.02599355112761259,
      "rewards/format_reward": 0.0,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2539.2083740234375,
      "epoch": 0.4473007712082262,
      "grad_norm": 0.1954081803560257,
      "kl": 0.01007843017578125,
      "learning_rate": 5.97037808470444e-07,
      "loss": 0.0316,
      "reward": 0.026572998613119125,
      "reward_std": 0.42085136845707893,
      "rewards/cosine_scaled_reward": 0.013286499306559563,
      "rewards/format_reward": 0.0,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2897.3472900390625,
      "epoch": 0.4490145672664953,
      "grad_norm": 0.1940917670726776,
      "kl": 0.01140594482421875,
      "learning_rate": 5.939123048916173e-07,
      "loss": 0.0318,
      "reward": -0.05599740147590637,
      "reward_std": 0.6964142769575119,
      "rewards/cosine_scaled_reward": -0.027998706325888634,
      "rewards/format_reward": 0.0,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2977.875,
      "epoch": 0.45072836332476435,
      "grad_norm": 0.2107793092727661,
      "kl": 0.011260986328125,
      "learning_rate": 5.907846610890011e-07,
      "loss": 0.0458,
      "reward": -0.443071685731411,
      "reward_std": 0.4884059280157089,
      "rewards/cosine_scaled_reward": -0.22153585404157639,
      "rewards/format_reward": 0.0,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2835.9166564941406,
      "epoch": 0.4524421593830334,
      "grad_norm": 0.1562654972076416,
      "kl": 0.0097503662109375,
      "learning_rate": 5.87655029499542e-07,
      "loss": 0.0527,
      "reward": -0.31120575219392776,
      "reward_std": 0.42043986171483994,
      "rewards/cosine_scaled_reward": -0.15560288727283478,
      "rewards/format_reward": 0.0,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3025.416748046875,
      "epoch": 0.4541559554413025,
      "grad_norm": 0.35641464591026306,
      "kl": 0.008880615234375,
      "learning_rate": 5.845235626570683e-07,
      "loss": 0.0023,
      "reward": 0.14537757262587547,
      "reward_std": 0.4222983121871948,
      "rewards/cosine_scaled_reward": 0.07268879748880863,
      "rewards/format_reward": 0.0,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2631.9445190429688,
      "epoch": 0.45586975149957154,
      "grad_norm": 0.1993650197982788,
      "kl": 0.006988525390625,
      "learning_rate": 5.813904131848564e-07,
      "loss": 0.02,
      "reward": -0.21425554435700178,
      "reward_std": 0.6343535855412483,
      "rewards/cosine_scaled_reward": -0.10712776239961386,
      "rewards/format_reward": 0.0,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2562.3611450195312,
      "epoch": 0.45758354755784064,
      "grad_norm": 0.22106732428073883,
      "kl": 0.01442718505859375,
      "learning_rate": 5.78255733788191e-07,
      "loss": 0.0396,
      "reward": -0.292802631855011,
      "reward_std": 0.3813341185450554,
      "rewards/cosine_scaled_reward": -0.1464013159275055,
      "rewards/format_reward": 0.0,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2572.2083740234375,
      "epoch": 0.4592973436161097,
      "grad_norm": 0.1711571365594864,
      "kl": 0.00759124755859375,
      "learning_rate": 5.751196772469237e-07,
      "loss": 0.0197,
      "reward": -0.2553995121270418,
      "reward_std": 0.5235799252986908,
      "rewards/cosine_scaled_reward": -0.1276997560635209,
      "rewards/format_reward": 0.0,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3053.2638549804688,
      "epoch": 0.46101113967437873,
      "grad_norm": 0.1386088728904724,
      "kl": 0.00843048095703125,
      "learning_rate": 5.71982396408026e-07,
      "loss": 0.0184,
      "reward": -0.17865224927663803,
      "reward_std": 0.5562375336885452,
      "rewards/cosine_scaled_reward": -0.08932612743228674,
      "rewards/format_reward": 0.0,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2723.638916015625,
      "epoch": 0.46272493573264784,
      "grad_norm": 2.8520348072052,
      "kl": 0.05461883544921875,
      "learning_rate": 5.688440441781398e-07,
      "loss": -0.0068,
      "reward": 0.27612179331481457,
      "reward_std": 0.7261447310447693,
      "rewards/cosine_scaled_reward": 0.13806088734418154,
      "rewards/format_reward": 0.0,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2795.666748046875,
      "epoch": 0.4644387317909169,
      "grad_norm": 0.1723846048116684,
      "kl": 0.01483154296875,
      "learning_rate": 5.657047735161255e-07,
      "loss": 0.0373,
      "reward": -0.03490264154970646,
      "reward_std": 0.6204687505960464,
      "rewards/cosine_scaled_reward": -0.017451307736337185,
      "rewards/format_reward": 0.0,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2526.1944580078125,
      "epoch": 0.4661525278491859,
      "grad_norm": 0.2960320711135864,
      "kl": 0.0132598876953125,
      "learning_rate": 5.625647374256061e-07,
      "loss": 0.0815,
      "reward": 0.11341174505650997,
      "reward_std": 0.5083474740386009,
      "rewards/cosine_scaled_reward": 0.05670587276108563,
      "rewards/format_reward": 0.0,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2640.1944580078125,
      "epoch": 0.46786632390745503,
      "grad_norm": 0.17620131373405457,
      "kl": 0.009765625,
      "learning_rate": 5.594240889475106e-07,
      "loss": 0.0112,
      "reward": 0.11540575325489044,
      "reward_std": 0.5552510917186737,
      "rewards/cosine_scaled_reward": 0.05770287476480007,
      "rewards/format_reward": 0.0,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3208.8333129882812,
      "epoch": 0.4695801199657241,
      "grad_norm": 0.15742135047912598,
      "kl": 0.01263427734375,
      "learning_rate": 5.562829811526154e-07,
      "loss": -0.0201,
      "reward": -0.4686981365084648,
      "reward_std": 0.3511890172958374,
      "rewards/cosine_scaled_reward": -0.2343490682542324,
      "rewards/format_reward": 0.0,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2885.9306030273438,
      "epoch": 0.4712939160239931,
      "grad_norm": 0.18644562363624573,
      "kl": 0.01094818115234375,
      "learning_rate": 5.531415671340826e-07,
      "loss": 0.0224,
      "reward": -0.2238161340355873,
      "reward_std": 0.5779955387115479,
      "rewards/cosine_scaled_reward": -0.11190806701779366,
      "rewards/format_reward": 0.0,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2641.7222290039062,
      "epoch": 0.4730077120822622,
      "grad_norm": 0.2060326635837555,
      "kl": 0.009002685546875,
      "learning_rate": 5.5e-07,
      "loss": 0.0921,
      "reward": -0.10621737875044346,
      "reward_std": 0.572068989276886,
      "rewards/cosine_scaled_reward": -0.053108690306544304,
      "rewards/format_reward": 0.0,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3040.3193969726562,
      "epoch": 0.47472150814053127,
      "grad_norm": 0.15230870246887207,
      "kl": 0.009868621826171875,
      "learning_rate": 5.468584328659172e-07,
      "loss": 0.0243,
      "reward": -0.27920062592602335,
      "reward_std": 0.4912775382399559,
      "rewards/cosine_scaled_reward": -0.13960031296301167,
      "rewards/format_reward": 0.0,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2733.8472900390625,
      "epoch": 0.47643530419880037,
      "grad_norm": 0.16092219948768616,
      "kl": 0.0078125,
      "learning_rate": 5.437170188473847e-07,
      "loss": 0.0214,
      "reward": 0.1801936998963356,
      "reward_std": 0.7019116431474686,
      "rewards/cosine_scaled_reward": 0.09009685181081295,
      "rewards/format_reward": 0.0,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3269.8194580078125,
      "epoch": 0.4781491002570694,
      "grad_norm": 0.13919058442115784,
      "kl": 0.01202392578125,
      "learning_rate": 5.405759110524894e-07,
      "loss": 0.0359,
      "reward": -0.2203904101625085,
      "reward_std": 0.5241215899586678,
      "rewards/cosine_scaled_reward": -0.11019521998241544,
      "rewards/format_reward": 0.0,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2767.611083984375,
      "epoch": 0.47986289631533846,
      "grad_norm": 0.17986002564430237,
      "kl": 0.01087188720703125,
      "learning_rate": 5.37435262574394e-07,
      "loss": 0.0177,
      "reward": 0.20984390750527382,
      "reward_std": 0.6492117866873741,
      "rewards/cosine_scaled_reward": 0.10492195282131433,
      "rewards/format_reward": 0.0,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2349.902801513672,
      "epoch": 0.48157669237360756,
      "grad_norm": 0.20755039155483246,
      "kl": 0.0091094970703125,
      "learning_rate": 5.342952264838747e-07,
      "loss": 0.0718,
      "reward": 0.09011890506371856,
      "reward_std": 0.755554661154747,
      "rewards/cosine_scaled_reward": 0.04505945247365162,
      "rewards/format_reward": 0.0,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2690.3333740234375,
      "epoch": 0.4832904884318766,
      "grad_norm": 0.16312456130981445,
      "kl": 0.0081787109375,
      "learning_rate": 5.311559558218603e-07,
      "loss": -0.0225,
      "reward": -0.1038619177415967,
      "reward_std": 0.6092793643474579,
      "rewards/cosine_scaled_reward": -0.05193095514550805,
      "rewards/format_reward": 0.0,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2992.4443969726562,
      "epoch": 0.48500428449014565,
      "grad_norm": 0.14668744802474976,
      "kl": 0.009429931640625,
      "learning_rate": 5.28017603591974e-07,
      "loss": 0.0094,
      "reward": -0.18053901614621282,
      "reward_std": 0.5393766239285469,
      "rewards/cosine_scaled_reward": -0.09026950527913868,
      "rewards/format_reward": 0.0,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2731.1666870117188,
      "epoch": 0.48671808054841476,
      "grad_norm": 0.18690791726112366,
      "kl": 0.01348876953125,
      "learning_rate": 5.248803227530763e-07,
      "loss": 0.0274,
      "reward": 0.05301067978143692,
      "reward_std": 0.8040451109409332,
      "rewards/cosine_scaled_reward": 0.026505338959395885,
      "rewards/format_reward": 0.0,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3109.9722290039062,
      "epoch": 0.4884318766066838,
      "grad_norm": 0.13096371293067932,
      "kl": 0.0122222900390625,
      "learning_rate": 5.21744266211809e-07,
      "loss": -0.0039,
      "reward": -0.10303456708788872,
      "reward_std": 0.6089868098497391,
      "rewards/cosine_scaled_reward": -0.05151727236807346,
      "rewards/format_reward": 0.0,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2716.9166259765625,
      "epoch": 0.49014567266495285,
      "grad_norm": 0.21381936967372894,
      "kl": 0.00853729248046875,
      "learning_rate": 5.186095868151436e-07,
      "loss": -0.0087,
      "reward": -0.08554558828473091,
      "reward_std": 0.6172359138727188,
      "rewards/cosine_scaled_reward": -0.042772796005010605,
      "rewards/format_reward": 0.0,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3128.3195190429688,
      "epoch": 0.49185946872322195,
      "grad_norm": 0.20883357524871826,
      "kl": 0.0149688720703125,
      "learning_rate": 5.154764373429315e-07,
      "loss": 0.0885,
      "reward": -0.16452566534280777,
      "reward_std": 0.6313002184033394,
      "rewards/cosine_scaled_reward": -0.08226283825933933,
      "rewards/format_reward": 0.0,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2783.2777709960938,
      "epoch": 0.493573264781491,
      "grad_norm": 0.18023322522640228,
      "kl": 0.016357421875,
      "learning_rate": 5.123449705004581e-07,
      "loss": 0.0778,
      "reward": -0.29250151151791215,
      "reward_std": 0.5800458639860153,
      "rewards/cosine_scaled_reward": -0.14625075762160122,
      "rewards/format_reward": 0.0,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2974.1944274902344,
      "epoch": 0.4952870608397601,
      "grad_norm": 0.226176917552948,
      "kl": 0.01458740234375,
      "learning_rate": 5.09215338910999e-07,
      "loss": 0.0448,
      "reward": -0.33544909581542015,
      "reward_std": 0.5062796398997307,
      "rewards/cosine_scaled_reward": -0.16772454418241978,
      "rewards/format_reward": 0.0,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2436.1944274902344,
      "epoch": 0.49700085689802914,
      "grad_norm": 0.1747155487537384,
      "kl": 0.012481689453125,
      "learning_rate": 5.060876951083828e-07,
      "loss": -0.0449,
      "reward": -0.14955687522888184,
      "reward_std": 0.5533142015337944,
      "rewards/cosine_scaled_reward": -0.07477843947708607,
      "rewards/format_reward": 0.0,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3039.3611450195312,
      "epoch": 0.4987146529562982,
      "grad_norm": 0.1754036843776703,
      "kl": 0.01313018798828125,
      "learning_rate": 5.02962191529556e-07,
      "loss": 0.0491,
      "reward": -0.44222037494182587,
      "reward_std": 0.49202967807650566,
      "rewards/cosine_scaled_reward": -0.22111019119620323,
      "rewards/format_reward": 0.0,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2386.4722595214844,
      "epoch": 0.5004284490145673,
      "grad_norm": 0.23209446668624878,
      "kl": 0.0133209228515625,
      "learning_rate": 4.998389805071536e-07,
      "loss": 0.1035,
      "reward": 0.11830113036558032,
      "reward_std": 0.7409112825989723,
      "rewards/cosine_scaled_reward": 0.059150564251467586,
      "rewards/format_reward": 0.0,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2481.1666870117188,
      "epoch": 0.5021422450728363,
      "grad_norm": 0.17551322281360626,
      "kl": 0.0098724365234375,
      "learning_rate": 4.967182142620745e-07,
      "loss": 0.0432,
      "reward": -0.1329963468015194,
      "reward_std": 0.5577030703425407,
      "rewards/cosine_scaled_reward": -0.06649817898869514,
      "rewards/format_reward": 0.0,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2711.5000610351562,
      "epoch": 0.5038560411311054,
      "grad_norm": 0.18221919238567352,
      "kl": 0.01308441162109375,
      "learning_rate": 4.93600044896063e-07,
      "loss": 0.0587,
      "reward": -0.21110662072896957,
      "reward_std": 0.5812349170446396,
      "rewards/cosine_scaled_reward": -0.10555331036448479,
      "rewards/format_reward": 0.0,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2619.5972290039062,
      "epoch": 0.5055698371893744,
      "grad_norm": 0.18888363242149353,
      "kl": 0.0113067626953125,
      "learning_rate": 4.904846243842949e-07,
      "loss": -0.0068,
      "reward": 0.10603267699480057,
      "reward_std": 0.6550966873764992,
      "rewards/cosine_scaled_reward": 0.053016334772109985,
      "rewards/format_reward": 0.0,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2851.4583129882812,
      "epoch": 0.5072836332476436,
      "grad_norm": 0.15981672704219818,
      "kl": 0.0122833251953125,
      "learning_rate": 4.873721045679706e-07,
      "loss": 0.0399,
      "reward": 0.07413195073604584,
      "reward_std": 0.6663401573896408,
      "rewards/cosine_scaled_reward": 0.03706597909331322,
      "rewards/format_reward": 0.0,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2682.5833435058594,
      "epoch": 0.5089974293059126,
      "grad_norm": 0.18823187053203583,
      "kl": 0.0098419189453125,
      "learning_rate": 4.842626371469149e-07,
      "loss": 0.0705,
      "reward": -0.035793907940387726,
      "reward_std": 0.5416731983423233,
      "rewards/cosine_scaled_reward": -0.017896955832839012,
      "rewards/format_reward": 0.0,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2439.0556030273438,
      "epoch": 0.5107112253641817,
      "grad_norm": 0.17318564653396606,
      "kl": 0.012298583984375,
      "learning_rate": 4.811563736721829e-07,
      "loss": 0.0134,
      "reward": 0.026430480182170868,
      "reward_std": 0.5753844156861305,
      "rewards/cosine_scaled_reward": 0.013215240091085434,
      "rewards/format_reward": 0.0,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2576.9027709960938,
      "epoch": 0.5124250214224507,
      "grad_norm": 0.21229924261569977,
      "kl": 0.0172576904296875,
      "learning_rate": 4.780534655386743e-07,
      "loss": 0.0552,
      "reward": 0.3652267027646303,
      "reward_std": 0.6922546178102493,
      "rewards/cosine_scaled_reward": 0.18261335138231516,
      "rewards/format_reward": 0.0,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2756.9305419921875,
      "epoch": 0.5141388174807198,
      "grad_norm": 0.19316470623016357,
      "kl": 0.011383056640625,
      "learning_rate": 4.749540639777539e-07,
      "loss": 0.0299,
      "reward": 0.22619394585490227,
      "reward_std": 0.4907483011484146,
      "rewards/cosine_scaled_reward": 0.11309697106480598,
      "rewards/format_reward": 0.0,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2826.9306640625,
      "epoch": 0.5158526135389888,
      "grad_norm": 0.233236625790596,
      "kl": 0.01507568359375,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.0851,
      "reward": -0.13008400797843933,
      "reward_std": 0.7507277429103851,
      "rewards/cosine_scaled_reward": -0.06504200212657452,
      "rewards/format_reward": 0.0,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3145.986083984375,
      "epoch": 0.517566409597258,
      "grad_norm": 0.1533445417881012,
      "kl": 0.0132293701171875,
      "learning_rate": 4.68766384637248e-07,
      "loss": 0.0074,
      "reward": -0.00015814602375030518,
      "reward_std": 0.7809525281190872,
      "rewards/cosine_scaled_reward": -7.90674239397049e-05,
      "rewards/format_reward": 0.0,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3029.40283203125,
      "epoch": 0.519280205655527,
      "grad_norm": 0.1652766764163971,
      "kl": 0.0153656005859375,
      "learning_rate": 4.656784084364238e-07,
      "loss": 0.0139,
      "reward": -0.06143874488770962,
      "reward_std": 0.7485700696706772,
      "rewards/cosine_scaled_reward": -0.030719374306499958,
      "rewards/format_reward": 0.0,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2729.52783203125,
      "epoch": 0.5209940017137961,
      "grad_norm": 0.18553942441940308,
      "kl": 0.01348876953125,
      "learning_rate": 4.6259454195101267e-07,
      "loss": -0.0083,
      "reward": -0.042102924548089504,
      "reward_std": 0.5168112218379974,
      "rewards/cosine_scaled_reward": -0.02105145249515772,
      "rewards/format_reward": 0.0,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2928.9444580078125,
      "epoch": 0.5227077977720651,
      "grad_norm": 0.24608513712882996,
      "kl": 0.01422119140625,
      "learning_rate": 4.59514935484316e-07,
      "loss": 0.0859,
      "reward": -0.11776435747742653,
      "reward_std": 0.6116138771176338,
      "rewards/cosine_scaled_reward": -0.058882176876068115,
      "rewards/format_reward": 0.0,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2568.7222290039062,
      "epoch": 0.5244215938303342,
      "grad_norm": 0.1760726422071457,
      "kl": 0.0126800537109375,
      "learning_rate": 4.5643973913200837e-07,
      "loss": 0.0106,
      "reward": -0.2896502474322915,
      "reward_std": 0.542039155960083,
      "rewards/cosine_scaled_reward": -0.1448251255787909,
      "rewards/format_reward": 0.0,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2816.7916259765625,
      "epoch": 0.5261353898886033,
      "grad_norm": 0.20767201483249664,
      "kl": 0.015869140625,
      "learning_rate": 4.5336910277482155e-07,
      "loss": 0.0605,
      "reward": -0.07595526240766048,
      "reward_std": 0.7446087747812271,
      "rewards/cosine_scaled_reward": -0.03797762934118509,
      "rewards/format_reward": 0.0,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2619.0,
      "epoch": 0.5278491859468724,
      "grad_norm": 0.18840792775154114,
      "kl": 0.01406097412109375,
      "learning_rate": 4.503031760712397e-07,
      "loss": 0.0317,
      "reward": -0.13000392355024815,
      "reward_std": 0.5407935008406639,
      "rewards/cosine_scaled_reward": -0.0650019682943821,
      "rewards/format_reward": 0.0,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2706.4583740234375,
      "epoch": 0.5295629820051414,
      "grad_norm": 0.20385313034057617,
      "kl": 0.017333984375,
      "learning_rate": 4.4724210845020494e-07,
      "loss": 0.0402,
      "reward": 0.09998160088434815,
      "reward_std": 0.6437982618808746,
      "rewards/cosine_scaled_reward": 0.049990794621407986,
      "rewards/format_reward": 0.0,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3076.0694580078125,
      "epoch": 0.5312767780634104,
      "grad_norm": 0.15351690351963043,
      "kl": 0.0145416259765625,
      "learning_rate": 4.441860491038345e-07,
      "loss": 0.0112,
      "reward": -0.1288044311950216,
      "reward_std": 0.5119795873761177,
      "rewards/cosine_scaled_reward": -0.0644022131091333,
      "rewards/format_reward": 0.0,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3078.3333129882812,
      "epoch": 0.5329905741216795,
      "grad_norm": 0.21041399240493774,
      "kl": 0.0153045654296875,
      "learning_rate": 4.4113514698014953e-07,
      "loss": 0.0569,
      "reward": -0.24592324905097485,
      "reward_std": 0.5915715545415878,
      "rewards/cosine_scaled_reward": -0.12296162731945515,
      "rewards/format_reward": 0.0,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2794.888916015625,
      "epoch": 0.5347043701799485,
      "grad_norm": 0.3006104528903961,
      "kl": 0.0116729736328125,
      "learning_rate": 4.3808955077581546e-07,
      "loss": 0.1717,
      "reward": 0.2339099831879139,
      "reward_std": 0.6782252490520477,
      "rewards/cosine_scaled_reward": 0.1169549860060215,
      "rewards/format_reward": 0.0,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2454.65283203125,
      "epoch": 0.5364181662382177,
      "grad_norm": 0.213435098528862,
      "kl": 0.0183868408203125,
      "learning_rate": 4.350494089288943e-07,
      "loss": -0.0051,
      "reward": -0.29112886637449265,
      "reward_std": 0.48665956407785416,
      "rewards/cosine_scaled_reward": -0.14556444063782692,
      "rewards/format_reward": 0.0,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2845.3194580078125,
      "epoch": 0.5381319622964867,
      "grad_norm": 0.23916800320148468,
      "kl": 0.0161285400390625,
      "learning_rate": 4.3201486961161093e-07,
      "loss": 0.0824,
      "reward": -0.16251583769917488,
      "reward_std": 0.4937269687652588,
      "rewards/cosine_scaled_reward": -0.08125792350620031,
      "rewards/format_reward": 0.0,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2697.9583740234375,
      "epoch": 0.5398457583547558,
      "grad_norm": 0.19882318377494812,
      "kl": 0.018157958984375,
      "learning_rate": 4.2898608072313045e-07,
      "loss": 0.0178,
      "reward": -0.25365344155579805,
      "reward_std": 0.5236896127462387,
      "rewards/cosine_scaled_reward": -0.12682672249502502,
      "rewards/format_reward": 0.0,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2544.9861450195312,
      "epoch": 0.5415595544130248,
      "grad_norm": 0.20584873855113983,
      "kl": 0.014862060546875,
      "learning_rate": 4.2596318988235037e-07,
      "loss": 0.0389,
      "reward": -0.09484067000448704,
      "reward_std": 0.6149067878723145,
      "rewards/cosine_scaled_reward": -0.04742033500224352,
      "rewards/format_reward": 0.0,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2950.8333740234375,
      "epoch": 0.5432733504712939,
      "grad_norm": 0.15868861973285675,
      "kl": 0.018310546875,
      "learning_rate": 4.2294634442070553e-07,
      "loss": 0.0378,
      "reward": -0.39894504845142365,
      "reward_std": 0.4898769110441208,
      "rewards/cosine_scaled_reward": -0.19947252236306667,
      "rewards/format_reward": 0.0,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2152.3195190429688,
      "epoch": 0.5449871465295629,
      "grad_norm": 0.1994917094707489,
      "kl": 0.0172882080078125,
      "learning_rate": 4.1993569137498776e-07,
      "loss": -0.0091,
      "reward": 0.24264823482371867,
      "reward_std": 0.6610805988311768,
      "rewards/cosine_scaled_reward": 0.12132412963546813,
      "rewards/format_reward": 0.0,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2402.5556030273438,
      "epoch": 0.5467009425878321,
      "grad_norm": 0.2102198302745819,
      "kl": 0.01351165771484375,
      "learning_rate": 4.1693137748017915e-07,
      "loss": -0.0681,
      "reward": 0.05987721309065819,
      "reward_std": 0.5766515731811523,
      "rewards/cosine_scaled_reward": 0.029938601423054934,
      "rewards/format_reward": 0.0,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2677.4027709960938,
      "epoch": 0.5484147386461011,
      "grad_norm": 0.2358679324388504,
      "kl": 0.01690673828125,
      "learning_rate": 4.1393354916230005e-07,
      "loss": 0.0956,
      "reward": -0.05587568995542824,
      "reward_std": 0.6320854872465134,
      "rewards/cosine_scaled_reward": -0.02793784497771412,
      "rewards/format_reward": 0.0,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3042.4722900390625,
      "epoch": 0.5501285347043702,
      "grad_norm": 0.18476322293281555,
      "kl": 0.017547607421875,
      "learning_rate": 4.1094235253127374e-07,
      "loss": 0.0512,
      "reward": -0.2119649334345013,
      "reward_std": 0.585174448788166,
      "rewards/cosine_scaled_reward": -0.1059824712574482,
      "rewards/format_reward": 0.0,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2080.375030517578,
      "epoch": 0.5518423307626392,
      "grad_norm": 0.18924832344055176,
      "kl": 0.0111083984375,
      "learning_rate": 4.079579333738039e-07,
      "loss": 0.0098,
      "reward": 0.3428979776799679,
      "reward_std": 0.7396816238760948,
      "rewards/cosine_scaled_reward": 0.1714489795267582,
      "rewards/format_reward": 0.0,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2770.7916870117188,
      "epoch": 0.5535561268209083,
      "grad_norm": 0.17449912428855896,
      "kl": 0.0141143798828125,
      "learning_rate": 4.0498043714627006e-07,
      "loss": 0.0149,
      "reward": -0.15011528879404068,
      "reward_std": 0.5199657753109932,
      "rewards/cosine_scaled_reward": -0.07505764067173004,
      "rewards/format_reward": 0.0,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2524.4305725097656,
      "epoch": 0.5552699228791774,
      "grad_norm": 0.25161027908325195,
      "kl": 0.01303863525390625,
      "learning_rate": 4.020100089676376e-07,
      "loss": 0.1119,
      "reward": 0.2225971333682537,
      "reward_std": 0.7053848057985306,
      "rewards/cosine_scaled_reward": 0.11129856202751398,
      "rewards/format_reward": 0.0,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2823.5694580078125,
      "epoch": 0.5569837189374465,
      "grad_norm": 0.17407697439193726,
      "kl": 0.016265869140625,
      "learning_rate": 3.9904679361238526e-07,
      "loss": -0.0328,
      "reward": -0.11739783291704953,
      "reward_std": 0.6684166565537453,
      "rewards/cosine_scaled_reward": -0.058698914712294936,
      "rewards/format_reward": 0.0,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2481.6944580078125,
      "epoch": 0.5586975149957155,
      "grad_norm": 0.16408374905586243,
      "kl": 0.01628875732421875,
      "learning_rate": 3.9609093550344907e-07,
      "loss": 0.0145,
      "reward": 0.05000840872526169,
      "reward_std": 0.4738306663930416,
      "rewards/cosine_scaled_reward": 0.025004200637340546,
      "rewards/format_reward": 0.0,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2850.8611450195312,
      "epoch": 0.5604113110539846,
      "grad_norm": 0.1830449402332306,
      "kl": 0.0183563232421875,
      "learning_rate": 3.931425787051832e-07,
      "loss": 0.054,
      "reward": -0.26191626861691475,
      "reward_std": 0.4200581759214401,
      "rewards/cosine_scaled_reward": -0.1309581445530057,
      "rewards/format_reward": 0.0,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2681.875030517578,
      "epoch": 0.5621251071122536,
      "grad_norm": 0.3444949984550476,
      "kl": 0.031097412109375,
      "learning_rate": 3.902018669163384e-07,
      "loss": 0.0002,
      "reward": 0.058326710015535355,
      "reward_std": 0.5914809927344322,
      "rewards/cosine_scaled_reward": 0.029163353145122528,
      "rewards/format_reward": 0.0,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2444.1805419921875,
      "epoch": 0.5638389031705227,
      "grad_norm": 0.20234812796115875,
      "kl": 0.0186004638671875,
      "learning_rate": 3.872689434630585e-07,
      "loss": 0.0297,
      "reward": 0.015948079526424408,
      "reward_std": 0.5476803705096245,
      "rewards/cosine_scaled_reward": 0.00797403953038156,
      "rewards/format_reward": 0.0,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2751.6666870117188,
      "epoch": 0.5655526992287918,
      "grad_norm": 0.20875848829746246,
      "kl": 0.0170745849609375,
      "learning_rate": 3.843439512918949e-07,
      "loss": 0.0404,
      "reward": -0.1900151213631034,
      "reward_std": 0.552287369966507,
      "rewards/cosine_scaled_reward": -0.09500756207853556,
      "rewards/format_reward": 0.0,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2700.7222900390625,
      "epoch": 0.5672664952870609,
      "grad_norm": 0.17264467477798462,
      "kl": 0.0172119140625,
      "learning_rate": 3.8142703296283953e-07,
      "loss": 0.0526,
      "reward": 0.03160311561077833,
      "reward_std": 0.5627969726920128,
      "rewards/cosine_scaled_reward": 0.015801557805389166,
      "rewards/format_reward": 0.0,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2822.8472900390625,
      "epoch": 0.5689802913453299,
      "grad_norm": 0.27976417541503906,
      "kl": 0.023895263671875,
      "learning_rate": 3.785183306423767e-07,
      "loss": 0.0355,
      "reward": 0.02845914661884308,
      "reward_std": 0.5001804158091545,
      "rewards/cosine_scaled_reward": 0.014229563996195793,
      "rewards/format_reward": 0.0,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2853.3333740234375,
      "epoch": 0.570694087403599,
      "grad_norm": 0.1514306664466858,
      "kl": 0.017669677734375,
      "learning_rate": 3.7561798609655373e-07,
      "loss": -0.0082,
      "reward": -0.13629086455330253,
      "reward_std": 0.4956332743167877,
      "rewards/cosine_scaled_reward": -0.06814542971551418,
      "rewards/format_reward": 0.0,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3072.9166870117188,
      "epoch": 0.572407883461868,
      "grad_norm": 0.14293867349624634,
      "kl": 0.0254974365234375,
      "learning_rate": 3.72726140684072e-07,
      "loss": 0.0174,
      "reward": 0.02665301039814949,
      "reward_std": 0.6765051260590553,
      "rewards/cosine_scaled_reward": 0.013326505199074745,
      "rewards/format_reward": 0.0,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2824.861114501953,
      "epoch": 0.5741216795201372,
      "grad_norm": 0.19958122074604034,
      "kl": 0.0171356201171875,
      "learning_rate": 3.6984293534939737e-07,
      "loss": 0.0929,
      "reward": -0.056068588979542255,
      "reward_std": 0.8257120847702026,
      "rewards/cosine_scaled_reward": -0.028034291230142117,
      "rewards/format_reward": 0.0,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2580.0,
      "epoch": 0.5758354755784062,
      "grad_norm": 0.229178324341774,
      "kl": 0.019195556640625,
      "learning_rate": 3.6696851061588994e-07,
      "loss": 0.006,
      "reward": -0.291859433054924,
      "reward_std": 0.4463714547455311,
      "rewards/cosine_scaled_reward": -0.1459297128021717,
      "rewards/format_reward": 0.0,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2450.138916015625,
      "epoch": 0.5775492716366752,
      "grad_norm": 0.27258360385894775,
      "kl": 0.0173187255859375,
      "learning_rate": 3.641030065789562e-07,
      "loss": 0.0321,
      "reward": 0.07944206055253744,
      "reward_std": 0.6395395249128342,
      "rewards/cosine_scaled_reward": 0.03972102585248649,
      "rewards/format_reward": 0.0,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2720.1805725097656,
      "epoch": 0.5792630676949443,
      "grad_norm": 0.20289485156536102,
      "kl": 0.019683837890625,
      "learning_rate": 3.612465628992203e-07,
      "loss": 0.069,
      "reward": 0.48021042346954346,
      "reward_std": 0.7420852333307266,
      "rewards/cosine_scaled_reward": 0.24010521546006203,
      "rewards/format_reward": 0.0,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3105.0833740234375,
      "epoch": 0.5809768637532133,
      "grad_norm": 0.18909570574760437,
      "kl": 0.022308349609375,
      "learning_rate": 3.5839931879571725e-07,
      "loss": 0.0378,
      "reward": -0.22961215861141682,
      "reward_std": 0.5897372663021088,
      "rewards/cosine_scaled_reward": -0.11480608023703098,
      "rewards/format_reward": 0.0,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2911.2083129882812,
      "epoch": 0.5826906598114824,
      "grad_norm": 0.20473147928714752,
      "kl": 0.022918701171875,
      "learning_rate": 3.555614130391079e-07,
      "loss": -0.0498,
      "reward": -0.1040644682943821,
      "reward_std": 0.57014200091362,
      "rewards/cosine_scaled_reward": -0.052032231353223324,
      "rewards/format_reward": 0.0,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2422.777801513672,
      "epoch": 0.5844044558697515,
      "grad_norm": 0.1749623566865921,
      "kl": 0.019775390625,
      "learning_rate": 3.5273298394491515e-07,
      "loss": 0.0118,
      "reward": -0.429408997297287,
      "reward_std": 0.39798443764448166,
      "rewards/cosine_scaled_reward": -0.2147044911980629,
      "rewards/format_reward": 0.0,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2946.736083984375,
      "epoch": 0.5861182519280206,
      "grad_norm": 0.20565366744995117,
      "kl": 0.01824951171875,
      "learning_rate": 3.4991416936678276e-07,
      "loss": 0.0572,
      "reward": -0.09714518021792173,
      "reward_std": 0.6395711675286293,
      "rewards/cosine_scaled_reward": -0.048572588711977005,
      "rewards/format_reward": 0.0,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2240.361114501953,
      "epoch": 0.5878320479862896,
      "grad_norm": 0.19630080461502075,
      "kl": 0.013671875,
      "learning_rate": 3.471051066897562e-07,
      "loss": 0.0286,
      "reward": 0.09563972940668464,
      "reward_std": 0.5933751873672009,
      "rewards/cosine_scaled_reward": 0.047819861210882664,
      "rewards/format_reward": 0.0,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2915.2916870117188,
      "epoch": 0.5895458440445587,
      "grad_norm": 0.20998388528823853,
      "kl": 0.0269317626953125,
      "learning_rate": 3.4430593282358777e-07,
      "loss": -0.0348,
      "reward": -0.3282645223662257,
      "reward_std": 0.49101946130394936,
      "rewards/cosine_scaled_reward": -0.16413226234726608,
      "rewards/format_reward": 0.0,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2668.0277709960938,
      "epoch": 0.5912596401028277,
      "grad_norm": 0.25542527437210083,
      "kl": 0.020050048828125,
      "learning_rate": 3.4151678419606233e-07,
      "loss": 0.0754,
      "reward": 0.21342255361378193,
      "reward_std": 0.653385765850544,
      "rewards/cosine_scaled_reward": 0.10671127680689096,
      "rewards/format_reward": 0.0,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2691.2638549804688,
      "epoch": 0.5929734361610969,
      "grad_norm": 0.21436557173728943,
      "kl": 0.0188446044921875,
      "learning_rate": 3.387377967463493e-07,
      "loss": 0.0297,
      "reward": -0.08409620448946953,
      "reward_std": 0.6964321285486221,
      "rewards/cosine_scaled_reward": -0.04204810503870249,
      "rewards/format_reward": 0.0,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2423.2083740234375,
      "epoch": 0.5946872322193659,
      "grad_norm": 0.2174253612756729,
      "kl": 0.0170745849609375,
      "learning_rate": 3.359691059183761e-07,
      "loss": -0.0145,
      "reward": 0.05711523536592722,
      "reward_std": 0.6910872906446457,
      "rewards/cosine_scaled_reward": 0.02855762024410069,
      "rewards/format_reward": 0.0,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2334.3194580078125,
      "epoch": 0.596401028277635,
      "grad_norm": 0.20871306955814362,
      "kl": 0.019622802734375,
      "learning_rate": 3.3321084665422803e-07,
      "loss": 0.0377,
      "reward": -0.29262126237154007,
      "reward_std": 0.5664101913571358,
      "rewards/cosine_scaled_reward": -0.14631063491106033,
      "rewards/format_reward": 0.0,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2684.4861450195312,
      "epoch": 0.598114824335904,
      "grad_norm": 0.2084410935640335,
      "kl": 0.0155181884765625,
      "learning_rate": 3.3046315338757026e-07,
      "loss": -0.0696,
      "reward": 0.2747867554426193,
      "reward_std": 0.6360199972987175,
      "rewards/cosine_scaled_reward": 0.13739337399601936,
      "rewards/format_reward": 0.0,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2624.9444580078125,
      "epoch": 0.5998286203941731,
      "grad_norm": 0.27559694647789,
      "kl": 0.01519775390625,
      "learning_rate": 3.2772616003709616e-07,
      "loss": 0.0439,
      "reward": 0.16777711734175682,
      "reward_std": 0.6573140621185303,
      "rewards/cosine_scaled_reward": 0.08388857543468475,
      "rewards/format_reward": 0.0,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2644.0416564941406,
      "epoch": 0.6015424164524421,
      "grad_norm": 0.21829356253147125,
      "kl": 0.020263671875,
      "learning_rate": 3.250000000000001e-07,
      "loss": 0.019,
      "reward": 0.04395672678947449,
      "reward_std": 0.5275484099984169,
      "rewards/cosine_scaled_reward": 0.02197836432605982,
      "rewards/format_reward": 0.0,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2948.3056030273438,
      "epoch": 0.6032562125107113,
      "grad_norm": 0.15744946897029877,
      "kl": 0.0189056396484375,
      "learning_rate": 3.222848061454764e-07,
      "loss": -0.0085,
      "reward": -0.41702286154031754,
      "reward_std": 0.5593557730317116,
      "rewards/cosine_scaled_reward": -0.20851144194602966,
      "rewards/format_reward": 0.0,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2635.7222595214844,
      "epoch": 0.6049700085689803,
      "grad_norm": 0.22034288942813873,
      "kl": 0.021209716796875,
      "learning_rate": 3.195807108082429e-07,
      "loss": -0.0335,
      "reward": -0.30768171697854996,
      "reward_std": 0.5821868106722832,
      "rewards/cosine_scaled_reward": -0.15384084545075893,
      "rewards/format_reward": 0.0,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2137.3055725097656,
      "epoch": 0.6066838046272494,
      "grad_norm": 0.276947557926178,
      "kl": 0.015472412109375,
      "learning_rate": 3.168878457820915e-07,
      "loss": 0.0844,
      "reward": 0.3251216746866703,
      "reward_std": 0.716858297586441,
      "rewards/cosine_scaled_reward": 0.16256084106862545,
      "rewards/format_reward": 0.0,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2492.5555725097656,
      "epoch": 0.6083976006855184,
      "grad_norm": 0.2037208080291748,
      "kl": 0.0183258056640625,
      "learning_rate": 3.142063423134644e-07,
      "loss": -0.0014,
      "reward": -0.21882931515574455,
      "reward_std": 0.47944844514131546,
      "rewards/cosine_scaled_reward": -0.10941465757787228,
      "rewards/format_reward": 0.0,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2614.3472290039062,
      "epoch": 0.6101113967437874,
      "grad_norm": 0.19817198812961578,
      "kl": 0.0220947265625,
      "learning_rate": 3.115363310950578e-07,
      "loss": 0.0141,
      "reward": -0.4298449754714966,
      "reward_std": 0.520567923784256,
      "rewards/cosine_scaled_reward": -0.2149224765598774,
      "rewards/format_reward": 0.0,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2584.777801513672,
      "epoch": 0.6118251928020566,
      "grad_norm": 0.18728262186050415,
      "kl": 0.021331787109375,
      "learning_rate": 3.0887794225945143e-07,
      "loss": 0.04,
      "reward": 0.04458676278591156,
      "reward_std": 0.49945997446775436,
      "rewards/cosine_scaled_reward": 0.02229338139295578,
      "rewards/format_reward": 0.0,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2934.513916015625,
      "epoch": 0.6135389888603257,
      "grad_norm": 0.17515863478183746,
      "kl": 0.017791748046875,
      "learning_rate": 3.062313053727671e-07,
      "loss": -0.0046,
      "reward": -0.0155550935305655,
      "reward_std": 0.607760101556778,
      "rewards/cosine_scaled_reward": -0.007777547696605325,
      "rewards/format_reward": 0.0,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2598.2638549804688,
      "epoch": 0.6152527849185947,
      "grad_norm": 0.20000198483467102,
      "kl": 0.0205535888671875,
      "learning_rate": 3.0359654942835247e-07,
      "loss": -0.008,
      "reward": -0.21508236415684223,
      "reward_std": 0.4807446375489235,
      "rewards/cosine_scaled_reward": -0.10754118673503399,
      "rewards/format_reward": 0.0,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2585.3333435058594,
      "epoch": 0.6169665809768637,
      "grad_norm": 0.1761714369058609,
      "kl": 0.01947021484375,
      "learning_rate": 3.0097380284049523e-07,
      "loss": 0.0011,
      "reward": -0.027444179635494947,
      "reward_std": 0.6417821869254112,
      "rewards/cosine_scaled_reward": -0.013722071889787912,
      "rewards/format_reward": 0.0,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2367.3611450195312,
      "epoch": 0.6186803770351328,
      "grad_norm": 0.1938982903957367,
      "kl": 0.01788330078125,
      "learning_rate": 2.9836319343816397e-07,
      "loss": -0.023,
      "reward": 0.0992561224848032,
      "reward_std": 0.7357365190982819,
      "rewards/cosine_scaled_reward": 0.04962805658578873,
      "rewards/format_reward": 0.0,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3139.541748046875,
      "epoch": 0.6203941730934018,
      "grad_norm": 0.17356501519680023,
      "kl": 0.024200439453125,
      "learning_rate": 2.9576484845877793e-07,
      "loss": -0.0258,
      "reward": -0.128750279545784,
      "reward_std": 0.5727476924657822,
      "rewards/cosine_scaled_reward": -0.06437514536082745,
      "rewards/format_reward": 0.0,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2882.4306030273438,
      "epoch": 0.622107969151671,
      "grad_norm": 0.19220975041389465,
      "kl": 0.0243377685546875,
      "learning_rate": 2.931788945420058e-07,
      "loss": -0.0247,
      "reward": -0.019596407189965248,
      "reward_std": 0.6233709305524826,
      "rewards/cosine_scaled_reward": -0.009798200335353613,
      "rewards/format_reward": 0.0,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2840.0555419921875,
      "epoch": 0.62382176520994,
      "grad_norm": 0.237908735871315,
      "kl": 0.02325439453125,
      "learning_rate": 2.9060545772359305e-07,
      "loss": 0.0684,
      "reward": -0.17538912501186132,
      "reward_std": 0.7643003761768341,
      "rewards/cosine_scaled_reward": -0.08769455272704363,
      "rewards/format_reward": 0.0,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2791.75,
      "epoch": 0.6255355612682091,
      "grad_norm": 0.1972544640302658,
      "kl": 0.022613525390625,
      "learning_rate": 2.8804466342921987e-07,
      "loss": -0.0356,
      "reward": -0.19943542033433914,
      "reward_std": 0.6234779357910156,
      "rewards/cosine_scaled_reward": -0.09971771761775017,
      "rewards/format_reward": 0.0,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2936.7916564941406,
      "epoch": 0.6272493573264781,
      "grad_norm": 0.1693785935640335,
      "kl": 0.022491455078125,
      "learning_rate": 2.854966364683872e-07,
      "loss": 0.0289,
      "reward": -0.07167929410934448,
      "reward_std": 0.41813354194164276,
      "rewards/cosine_scaled_reward": -0.035839639604091644,
      "rewards/format_reward": 0.0,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2579.8056030273438,
      "epoch": 0.6289631533847472,
      "grad_norm": 0.18452903628349304,
      "kl": 0.02313232421875,
      "learning_rate": 2.829615010283344e-07,
      "loss": 0.0131,
      "reward": 0.13851050520315766,
      "reward_std": 0.6860260739922523,
      "rewards/cosine_scaled_reward": 0.06925524887628853,
      "rewards/format_reward": 0.0,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3069.2361450195312,
      "epoch": 0.6306769494430163,
      "grad_norm": 0.207699254155159,
      "kl": 0.026519775390625,
      "learning_rate": 2.8043938066798645e-07,
      "loss": 0.0636,
      "reward": -0.25442312750965357,
      "reward_std": 0.5900055021047592,
      "rewards/cosine_scaled_reward": -0.12721156049519777,
      "rewards/format_reward": 0.0,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2244.763916015625,
      "epoch": 0.6323907455012854,
      "grad_norm": 0.17845271527767181,
      "kl": 0.0156707763671875,
      "learning_rate": 2.7793039831193133e-07,
      "loss": 0.0488,
      "reward": 0.17914995457977057,
      "reward_std": 0.7317003160715103,
      "rewards/cosine_scaled_reward": 0.08957497263327241,
      "rewards/format_reward": 0.0,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2134.1944274902344,
      "epoch": 0.6341045415595544,
      "grad_norm": 0.2277487814426422,
      "kl": 0.01385498046875,
      "learning_rate": 2.7543467624442956e-07,
      "loss": -0.0127,
      "reward": 0.11734075238928199,
      "reward_std": 0.5018965676426888,
      "rewards/cosine_scaled_reward": 0.05867037340067327,
      "rewards/format_reward": 0.0,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2490.500030517578,
      "epoch": 0.6358183376178235,
      "grad_norm": 0.21075375378131866,
      "kl": 0.02069091796875,
      "learning_rate": 2.729523361034538e-07,
      "loss": 0.0493,
      "reward": -0.03656116779893637,
      "reward_std": 0.4987756237387657,
      "rewards/cosine_scaled_reward": -0.018280583899468184,
      "rewards/format_reward": 0.0,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2664.625030517578,
      "epoch": 0.6375321336760925,
      "grad_norm": 0.22036224603652954,
      "kl": 0.01885986328125,
      "learning_rate": 2.7048349887476037e-07,
      "loss": 0.0736,
      "reward": -0.017365715699270368,
      "reward_std": 0.7068077325820923,
      "rewards/cosine_scaled_reward": -0.008682856685481966,
      "rewards/format_reward": 0.0,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2659.3889770507812,
      "epoch": 0.6392459297343616,
      "grad_norm": 0.2022118866443634,
      "kl": 0.0198822021484375,
      "learning_rate": 2.6802828488599294e-07,
      "loss": 0.011,
      "reward": -0.049437786685302854,
      "reward_std": 0.5779630020260811,
      "rewards/cosine_scaled_reward": -0.024718896602280438,
      "rewards/format_reward": 0.0,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2616.4306030273438,
      "epoch": 0.6409597257926307,
      "grad_norm": 0.1780145913362503,
      "kl": 0.0235595703125,
      "learning_rate": 2.655868138008171e-07,
      "loss": 0.0089,
      "reward": -0.017803641967475414,
      "reward_std": 0.6717728674411774,
      "rewards/cosine_scaled_reward": -0.00890181539580226,
      "rewards/format_reward": 0.0,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2676.3472290039062,
      "epoch": 0.6426735218508998,
      "grad_norm": 0.15247489511966705,
      "kl": 0.026458740234375,
      "learning_rate": 2.631592046130896e-07,
      "loss": 0.0205,
      "reward": -0.31310519203543663,
      "reward_std": 0.5878890082240105,
      "rewards/cosine_scaled_reward": -0.15655260160565376,
      "rewards/format_reward": 0.0,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2356.2083129882812,
      "epoch": 0.6443873179091688,
      "grad_norm": 0.2001314014196396,
      "kl": 0.0235595703125,
      "learning_rate": 2.6074557564105724e-07,
      "loss": 0.0174,
      "reward": -0.2070534396916628,
      "reward_std": 0.4216439947485924,
      "rewards/cosine_scaled_reward": -0.1035267198458314,
      "rewards/format_reward": 0.0,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2577.5695190429688,
      "epoch": 0.6461011139674379,
      "grad_norm": 0.19885659217834473,
      "kl": 0.0179290771484375,
      "learning_rate": 2.583460445215911e-07,
      "loss": -0.0114,
      "reward": -0.2356225922703743,
      "reward_std": 0.4705282226204872,
      "rewards/cosine_scaled_reward": -0.1178113017231226,
      "rewards/format_reward": 0.0,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2827.4305419921875,
      "epoch": 0.6478149100257069,
      "grad_norm": 0.16866172850131989,
      "kl": 0.022796630859375,
      "learning_rate": 2.5596072820445254e-07,
      "loss": 0.0359,
      "reward": -0.2195772840641439,
      "reward_std": 0.7464367002248764,
      "rewards/cosine_scaled_reward": -0.1097886401694268,
      "rewards/format_reward": 0.0,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2001.8472595214844,
      "epoch": 0.6495287060839761,
      "grad_norm": 0.27339431643486023,
      "kl": 0.025421142578125,
      "learning_rate": 2.5358974294659373e-07,
      "loss": -0.0481,
      "reward": -0.053384889382869005,
      "reward_std": 0.7801851779222488,
      "rewards/cosine_scaled_reward": -0.026692438637837768,
      "rewards/format_reward": 0.0,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2380.8611450195312,
      "epoch": 0.6512425021422451,
      "grad_norm": 0.49418047070503235,
      "kl": 0.028839111328125,
      "learning_rate": 2.512332043064913e-07,
      "loss": 0.1507,
      "reward": -0.04335943330079317,
      "reward_std": 0.7678016275167465,
      "rewards/cosine_scaled_reward": -0.021679717116057873,
      "rewards/format_reward": 0.0,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2910.1806640625,
      "epoch": 0.6529562982005142,
      "grad_norm": 0.19250288605690002,
      "kl": 0.022003173828125,
      "learning_rate": 2.488912271385139e-07,
      "loss": 0.0447,
      "reward": -0.1130654625594616,
      "reward_std": 0.5473960787057877,
      "rewards/cosine_scaled_reward": -0.05653274059295654,
      "rewards/format_reward": 0.0,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2682.9861450195312,
      "epoch": 0.6546700942587832,
      "grad_norm": 0.1798926293849945,
      "kl": 0.019439697265625,
      "learning_rate": 2.465639255873246e-07,
      "loss": -0.0224,
      "reward": -0.07310536503791809,
      "reward_std": 0.6817247718572617,
      "rewards/cosine_scaled_reward": -0.036552680656313896,
      "rewards/format_reward": 0.0,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2607.763916015625,
      "epoch": 0.6563838903170522,
      "grad_norm": 0.24983283877372742,
      "kl": 0.026153564453125,
      "learning_rate": 2.4425141308231765e-07,
      "loss": 0.0197,
      "reward": -0.24107037298381329,
      "reward_std": 0.6102746799588203,
      "rewards/cosine_scaled_reward": -0.12053518556058407,
      "rewards/format_reward": 0.0,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2681.277801513672,
      "epoch": 0.6580976863753213,
      "grad_norm": 0.21532803773880005,
      "kl": 0.0153350830078125,
      "learning_rate": 2.4195380233209006e-07,
      "loss": 0.0375,
      "reward": -0.2287786863744259,
      "reward_std": 0.5439959019422531,
      "rewards/cosine_scaled_reward": -0.11438935063779354,
      "rewards/format_reward": 0.0,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2749.4305725097656,
      "epoch": 0.6598114824335904,
      "grad_norm": 0.23645354807376862,
      "kl": 0.02471923828125,
      "learning_rate": 2.3967120531894857e-07,
      "loss": -0.0225,
      "reward": -0.1737481877207756,
      "reward_std": 0.5551631152629852,
      "rewards/cosine_scaled_reward": -0.0868740938603878,
      "rewards/format_reward": 0.0,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3019.7361450195312,
      "epoch": 0.6615252784918595,
      "grad_norm": 0.18375760316848755,
      "kl": 0.019195556640625,
      "learning_rate": 2.374037332934512e-07,
      "loss": 0.0429,
      "reward": -0.34039000049233437,
      "reward_std": 0.5544994547963142,
      "rewards/cosine_scaled_reward": -0.17019500210881233,
      "rewards/format_reward": 0.0,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2555.625030517578,
      "epoch": 0.6632390745501285,
      "grad_norm": 0.2520519196987152,
      "kl": 0.0201873779296875,
      "learning_rate": 2.3515149676898552e-07,
      "loss": 0.0754,
      "reward": 0.06691954471170902,
      "reward_std": 0.4953342378139496,
      "rewards/cosine_scaled_reward": 0.03345977142453194,
      "rewards/format_reward": 0.0,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2198.75,
      "epoch": 0.6649528706083976,
      "grad_norm": 0.21169999241828918,
      "kl": 0.0220489501953125,
      "learning_rate": 2.3291460551638237e-07,
      "loss": -0.0328,
      "reward": 0.10132637619972229,
      "reward_std": 0.6322794482111931,
      "rewards/cosine_scaled_reward": 0.050663191825151443,
      "rewards/format_reward": 0.0,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2786.27783203125,
      "epoch": 0.6666666666666666,
      "grad_norm": 0.18405954539775848,
      "kl": 0.0251617431640625,
      "learning_rate": 2.306931685585657e-07,
      "loss": -0.0196,
      "reward": 0.03023771196603775,
      "reward_std": 0.46946871280670166,
      "rewards/cosine_scaled_reward": 0.015118852257728577,
      "rewards/format_reward": 0.0,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2521.5693969726562,
      "epoch": 0.6683804627249358,
      "grad_norm": 0.19272808730602264,
      "kl": 0.0200347900390625,
      "learning_rate": 2.2848729416523859e-07,
      "loss": 0.0461,
      "reward": 0.00521535862935707,
      "reward_std": 0.616911455988884,
      "rewards/cosine_scaled_reward": 0.0026076845824718475,
      "rewards/format_reward": 0.0,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2864.041748046875,
      "epoch": 0.6700942587832048,
      "grad_norm": 0.2623915672302246,
      "kl": 0.0235595703125,
      "learning_rate": 2.2629708984760706e-07,
      "loss": -0.002,
      "reward": -0.1861814223229885,
      "reward_std": 0.5339604392647743,
      "rewards/cosine_scaled_reward": -0.0930907130241394,
      "rewards/format_reward": 0.0,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2380.9583740234375,
      "epoch": 0.6718080548414739,
      "grad_norm": 0.25610801577568054,
      "kl": 0.02032470703125,
      "learning_rate": 2.2412266235313973e-07,
      "loss": -0.0448,
      "reward": -0.07657308876514435,
      "reward_std": 0.6799488365650177,
      "rewards/cosine_scaled_reward": -0.038286540657281876,
      "rewards/format_reward": 0.0,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2998.5139770507812,
      "epoch": 0.6735218508997429,
      "grad_norm": 0.19235925376415253,
      "kl": 0.0222015380859375,
      "learning_rate": 2.2196411766036487e-07,
      "loss": 0.0569,
      "reward": -0.001154482364654541,
      "reward_std": 0.5102438926696777,
      "rewards/cosine_scaled_reward": -0.0005772355943918228,
      "rewards/format_reward": 0.0,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2115.1806030273438,
      "epoch": 0.675235646958012,
      "grad_norm": 0.2744181752204895,
      "kl": 0.027099609375,
      "learning_rate": 2.1982156097370557e-07,
      "loss": 0.0221,
      "reward": 0.058095297776162624,
      "reward_std": 0.718009740114212,
      "rewards/cosine_scaled_reward": 0.029047648888081312,
      "rewards/format_reward": 0.0,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2774.1806030273438,
      "epoch": 0.676949443016281,
      "grad_norm": 0.19175058603286743,
      "kl": 0.025482177734375,
      "learning_rate": 2.1769509671835223e-07,
      "loss": 0.0352,
      "reward": -0.136960469186306,
      "reward_std": 0.511358916759491,
      "rewards/cosine_scaled_reward": -0.0684802271425724,
      "rewards/format_reward": 0.0,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3057.9584350585938,
      "epoch": 0.6786632390745502,
      "grad_norm": 0.16223175823688507,
      "kl": 0.019256591796875,
      "learning_rate": 2.1558482853517253e-07,
      "loss": 0.0288,
      "reward": -0.04862111946567893,
      "reward_std": 0.5186164565384388,
      "rewards/cosine_scaled_reward": -0.024310562410391867,
      "rewards/format_reward": 0.0,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2582.2916870117188,
      "epoch": 0.6803770351328192,
      "grad_norm": 0.287038654088974,
      "kl": 0.0208740234375,
      "learning_rate": 2.134908592756607e-07,
      "loss": -0.0666,
      "reward": -0.17554645985364914,
      "reward_std": 0.5096240639686584,
      "rewards/cosine_scaled_reward": -0.08777323365211487,
      "rewards/format_reward": 0.0,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3108.4444580078125,
      "epoch": 0.6820908311910883,
      "grad_norm": 0.1679101139307022,
      "kl": 0.0231781005859375,
      "learning_rate": 2.1141329099692406e-07,
      "loss": -0.0035,
      "reward": 0.038632214069366455,
      "reward_std": 0.7707736194133759,
      "rewards/cosine_scaled_reward": 0.019316108897328377,
      "rewards/format_reward": 0.0,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1893.5972442626953,
      "epoch": 0.6838046272493573,
      "grad_norm": 0.2708974778652191,
      "kl": 0.0231170654296875,
      "learning_rate": 2.0935222495670968e-07,
      "loss": 0.0065,
      "reward": 0.1442592293024063,
      "reward_std": 0.5131981894373894,
      "rewards/cosine_scaled_reward": 0.07212962210178375,
      "rewards/format_reward": 0.0,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2922.7916870117188,
      "epoch": 0.6855184233076264,
      "grad_norm": 0.3239152133464813,
      "kl": 0.023956298828125,
      "learning_rate": 2.0730776160846853e-07,
      "loss": -0.0809,
      "reward": -0.12957404926419258,
      "reward_std": 0.5665386915206909,
      "rewards/cosine_scaled_reward": -0.06478701997548342,
      "rewards/format_reward": 0.0,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2859.4444580078125,
      "epoch": 0.6872322193658955,
      "grad_norm": 0.19043125212192535,
      "kl": 0.0223541259765625,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.0588,
      "reward": -0.32605881802737713,
      "reward_std": 0.5183117464184761,
      "rewards/cosine_scaled_reward": -0.16302942391484976,
      "rewards/format_reward": 0.0,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2781.5556030273438,
      "epoch": 0.6889460154241646,
      "grad_norm": 0.26216545701026917,
      "kl": 0.026580810546875,
      "learning_rate": 2.032690407508949e-07,
      "loss": -0.0263,
      "reward": -0.4961502104997635,
      "reward_std": 0.3931718245148659,
      "rewards/cosine_scaled_reward": -0.24807510524988174,
      "rewards/format_reward": 0.0,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2736.1111450195312,
      "epoch": 0.6906598114824336,
      "grad_norm": 0.21833109855651855,
      "kl": 0.02435302734375,
      "learning_rate": 2.0127498008311922e-07,
      "loss": 0.0585,
      "reward": 0.3037844013888389,
      "reward_std": 0.5833063200116158,
      "rewards/cosine_scaled_reward": 0.1518922229297459,
      "rewards/format_reward": 0.0,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2712.861114501953,
      "epoch": 0.6923736075407027,
      "grad_norm": 0.2146695852279663,
      "kl": 0.025238037109375,
      "learning_rate": 1.9929791578083655e-07,
      "loss": -0.041,
      "reward": -0.21084421500563622,
      "reward_std": 0.4842342808842659,
      "rewards/cosine_scaled_reward": -0.10542210191488266,
      "rewards/format_reward": 0.0,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2936.3333129882812,
      "epoch": 0.6940874035989717,
      "grad_norm": 0.18586868047714233,
      "kl": 0.024658203125,
      "learning_rate": 1.9733794420337213e-07,
      "loss": 0.005,
      "reward": 0.050316065549850464,
      "reward_std": 0.5316065326333046,
      "rewards/cosine_scaled_reward": 0.02515802625566721,
      "rewards/format_reward": 0.0,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2209.0972595214844,
      "epoch": 0.6958011996572407,
      "grad_norm": 0.2406679093837738,
      "kl": 0.02728271484375,
      "learning_rate": 1.9539516087697517e-07,
      "loss": -0.0131,
      "reward": -0.021612104028463364,
      "reward_std": 0.5742413327097893,
      "rewards/cosine_scaled_reward": -0.010806052014231682,
      "rewards/format_reward": 0.0,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2988.3056640625,
      "epoch": 0.6975149957155099,
      "grad_norm": 0.23395532369613647,
      "kl": 0.027374267578125,
      "learning_rate": 1.934696604901642e-07,
      "loss": 0.0598,
      "reward": -0.08433661237359047,
      "reward_std": 0.5562912449240685,
      "rewards/cosine_scaled_reward": -0.04216831736266613,
      "rewards/format_reward": 0.0,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2319.6111755371094,
      "epoch": 0.699228791773779,
      "grad_norm": 0.2508476972579956,
      "kl": 0.0188751220703125,
      "learning_rate": 1.915615368891117e-07,
      "loss": -0.0462,
      "reward": 0.5069457921199501,
      "reward_std": 0.5437265560030937,
      "rewards/cosine_scaled_reward": 0.25347290316130966,
      "rewards/format_reward": 0.0,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2740.7777709960938,
      "epoch": 0.700942587832048,
      "grad_norm": 0.18038439750671387,
      "kl": 0.030517578125,
      "learning_rate": 1.8967088307307e-07,
      "loss": 0.0239,
      "reward": 0.10421705152839422,
      "reward_std": 0.6194805726408958,
      "rewards/cosine_scaled_reward": 0.052108526695519686,
      "rewards/format_reward": 0.0,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2735.8472595214844,
      "epoch": 0.702656383890317,
      "grad_norm": 0.2515905201435089,
      "kl": 0.02587890625,
      "learning_rate": 1.8779779118983867e-07,
      "loss": -0.0311,
      "reward": -0.1710510030388832,
      "reward_std": 0.5620269253849983,
      "rewards/cosine_scaled_reward": -0.0855255089700222,
      "rewards/format_reward": 0.0,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2493.388916015625,
      "epoch": 0.7043701799485861,
      "grad_norm": 0.21452462673187256,
      "kl": 0.0193328857421875,
      "learning_rate": 1.8594235253127372e-07,
      "loss": -0.0157,
      "reward": -0.25840797275304794,
      "reward_std": 0.4374122992157936,
      "rewards/cosine_scaled_reward": -0.12920398078858852,
      "rewards/format_reward": 0.0,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2339.2083740234375,
      "epoch": 0.7060839760068551,
      "grad_norm": 0.2920970320701599,
      "kl": 0.02203369140625,
      "learning_rate": 1.8410465752883758e-07,
      "loss": -0.0518,
      "reward": -0.25962352380156517,
      "reward_std": 0.5908957123756409,
      "rewards/cosine_scaled_reward": -0.129811754450202,
      "rewards/format_reward": 0.0,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2770.0694580078125,
      "epoch": 0.7077977720651243,
      "grad_norm": 0.2641778290271759,
      "kl": 0.026947021484375,
      "learning_rate": 1.822847957491922e-07,
      "loss": 0.061,
      "reward": -0.15783867985010147,
      "reward_std": 0.5947980135679245,
      "rewards/cosine_scaled_reward": -0.07891935110092163,
      "rewards/format_reward": 0.0,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2950.1945190429688,
      "epoch": 0.7095115681233933,
      "grad_norm": 0.2011335790157318,
      "kl": 0.024566650390625,
      "learning_rate": 1.804828558898332e-07,
      "loss": 0.0014,
      "reward": -0.0009787320159375668,
      "reward_std": 0.7296510636806488,
      "rewards/cosine_scaled_reward": -0.0004893671721220016,
      "rewards/format_reward": 0.0,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2707.9305419921875,
      "epoch": 0.7112253641816624,
      "grad_norm": 0.3163622319698334,
      "kl": 0.027496337890625,
      "learning_rate": 1.7869892577476722e-07,
      "loss": 0.0567,
      "reward": -0.3990987651050091,
      "reward_std": 0.43145136535167694,
      "rewards/cosine_scaled_reward": -0.1995493769645691,
      "rewards/format_reward": 0.0,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2968.0833129882812,
      "epoch": 0.7129391602399314,
      "grad_norm": 0.23538915812969208,
      "kl": 0.02618408203125,
      "learning_rate": 1.7693309235023127e-07,
      "loss": 0.0018,
      "reward": -0.08291278406977654,
      "reward_std": 0.4231496602296829,
      "rewards/cosine_scaled_reward": -0.041456387378275394,
      "rewards/format_reward": 0.0,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2606.3333129882812,
      "epoch": 0.7146529562982005,
      "grad_norm": 0.3015352785587311,
      "kl": 0.0229949951171875,
      "learning_rate": 1.7518544168045524e-07,
      "loss": 0.0752,
      "reward": -0.3149372674524784,
      "reward_std": 0.6667703241109848,
      "rewards/cosine_scaled_reward": -0.1574686411768198,
      "rewards/format_reward": 0.0,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2380.4445190429688,
      "epoch": 0.7163667523564696,
      "grad_norm": 0.22723092138767242,
      "kl": 0.0203704833984375,
      "learning_rate": 1.7345605894346726e-07,
      "loss": 0.0512,
      "reward": -0.34560693614184856,
      "reward_std": 0.4205815941095352,
      "rewards/cosine_scaled_reward": -0.17280346807092428,
      "rewards/format_reward": 0.0,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2430.125,
      "epoch": 0.7180805484147387,
      "grad_norm": 0.23899339139461517,
      "kl": 0.02789306640625,
      "learning_rate": 1.7174502842694212e-07,
      "loss": -0.0302,
      "reward": -0.18839553371071815,
      "reward_std": 0.4583168476819992,
      "rewards/cosine_scaled_reward": -0.09419775661081076,
      "rewards/format_reward": 0.0,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2552.4304809570312,
      "epoch": 0.7197943444730077,
      "grad_norm": 0.2333478480577469,
      "kl": 0.0173492431640625,
      "learning_rate": 1.7005243352409333e-07,
      "loss": 0.0486,
      "reward": -0.1561539713293314,
      "reward_std": 0.6325561329722404,
      "rewards/cosine_scaled_reward": -0.07807699032127857,
      "rewards/format_reward": 0.0,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2746.0694580078125,
      "epoch": 0.7215081405312768,
      "grad_norm": 0.2854664921760559,
      "kl": 0.0223388671875,
      "learning_rate": 1.6837835672960831e-07,
      "loss": 0.0759,
      "reward": -0.08271846733987331,
      "reward_std": 0.6506856456398964,
      "rewards/cosine_scaled_reward": -0.04135924857109785,
      "rewards/format_reward": 0.0,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2300.3055725097656,
      "epoch": 0.7232219365895458,
      "grad_norm": 0.17767125368118286,
      "kl": 0.0181427001953125,
      "learning_rate": 1.6672287963562852e-07,
      "loss": 0.0242,
      "reward": -0.16465576738119125,
      "reward_std": 0.5095989629626274,
      "rewards/cosine_scaled_reward": -0.08232788741588593,
      "rewards/format_reward": 0.0,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2779.0694580078125,
      "epoch": 0.7249357326478149,
      "grad_norm": 0.1646861433982849,
      "kl": 0.019927978515625,
      "learning_rate": 1.6508608292777203e-07,
      "loss": 0.0035,
      "reward": -0.05413434375077486,
      "reward_std": 0.7594424337148666,
      "rewards/cosine_scaled_reward": -0.02706717373803258,
      "rewards/format_reward": 0.0,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2956.7083129882812,
      "epoch": 0.726649528706084,
      "grad_norm": 0.22844961285591125,
      "kl": 0.03106689453125,
      "learning_rate": 1.6346804638120098e-07,
      "loss": -0.023,
      "reward": -0.16962197236716747,
      "reward_std": 0.6577330157160759,
      "rewards/cosine_scaled_reward": -0.08481098245829344,
      "rewards/format_reward": 0.0,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2825.4444580078125,
      "epoch": 0.7283633247643531,
      "grad_norm": 0.21431787312030792,
      "kl": 0.03460693359375,
      "learning_rate": 1.6186884885673413e-07,
      "loss": -0.0182,
      "reward": -0.06549269519746304,
      "reward_std": 0.6411803439259529,
      "rewards/cosine_scaled_reward": -0.032746341079473495,
      "rewards/format_reward": 0.0,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2121.5556030273438,
      "epoch": 0.7300771208226221,
      "grad_norm": 0.28984084725379944,
      "kl": 0.0258026123046875,
      "learning_rate": 1.6028856829700258e-07,
      "loss": -0.0539,
      "reward": 0.15816697012633085,
      "reward_std": 0.5270659551024437,
      "rewards/cosine_scaled_reward": 0.07908349251374602,
      "rewards/format_reward": 0.0,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2789.486083984375,
      "epoch": 0.7317909168808912,
      "grad_norm": 0.20834913849830627,
      "kl": 0.022003173828125,
      "learning_rate": 1.5872728172265146e-07,
      "loss": -0.0337,
      "reward": -0.370651263743639,
      "reward_std": 0.526657946407795,
      "rewards/cosine_scaled_reward": -0.18532563000917435,
      "rewards/format_reward": 0.0,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2442.777801513672,
      "epoch": 0.7335047129391602,
      "grad_norm": 0.2500321567058563,
      "kl": 0.021209716796875,
      "learning_rate": 1.5718506522858572e-07,
      "loss": 0.0647,
      "reward": 0.2879646308720112,
      "reward_std": 0.6987240761518478,
      "rewards/cosine_scaled_reward": 0.1439823191612959,
      "rewards/format_reward": 0.0,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2859.0694580078125,
      "epoch": 0.7352185089974294,
      "grad_norm": 0.17108042538166046,
      "kl": 0.02716064453125,
      "learning_rate": 1.5566199398026147e-07,
      "loss": 0.0365,
      "reward": -0.21791245974600315,
      "reward_std": 0.5681828185915947,
      "rewards/cosine_scaled_reward": -0.10895622940734029,
      "rewards/format_reward": 0.0,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2675.52783203125,
      "epoch": 0.7369323050556984,
      "grad_norm": 0.18789908289909363,
      "kl": 0.022308349609375,
      "learning_rate": 1.5415814221002265e-07,
      "loss": 0.0154,
      "reward": -0.023968554101884365,
      "reward_std": 0.5900578051805496,
      "rewards/cosine_scaled_reward": -0.01198427053168416,
      "rewards/format_reward": 0.0,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2358.041717529297,
      "epoch": 0.7386461011139674,
      "grad_norm": 0.24318993091583252,
      "kl": 0.022705078125,
      "learning_rate": 1.5267358321348285e-07,
      "loss": 0.0687,
      "reward": 0.029904491268098354,
      "reward_std": 0.7376819550991058,
      "rewards/cosine_scaled_reward": 0.01495224516838789,
      "rewards/format_reward": 0.0,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3008.9306030273438,
      "epoch": 0.7403598971722365,
      "grad_norm": 0.17112420499324799,
      "kl": 0.0255126953125,
      "learning_rate": 1.5120838934595337e-07,
      "loss": 0.0164,
      "reward": -0.09738675877451897,
      "reward_std": 0.39827052876353264,
      "rewards/cosine_scaled_reward": -0.04869337775744498,
      "rewards/format_reward": 0.0,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2719.166717529297,
      "epoch": 0.7420736932305055,
      "grad_norm": 0.17819786071777344,
      "kl": 0.024810791015625,
      "learning_rate": 1.4976263201891613e-07,
      "loss": 0.0207,
      "reward": 0.0039961859583854675,
      "reward_std": 0.4406754970550537,
      "rewards/cosine_scaled_reward": 0.0019980808719992638,
      "rewards/format_reward": 0.0,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2033.6805419921875,
      "epoch": 0.7437874892887746,
      "grad_norm": 0.25923460721969604,
      "kl": 0.0193939208984375,
      "learning_rate": 1.483363816965435e-07,
      "loss": 0.0555,
      "reward": -0.2742752702906728,
      "reward_std": 0.617987684905529,
      "rewards/cosine_scaled_reward": -0.1371376351453364,
      "rewards/format_reward": 0.0,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2954.9027709960938,
      "epoch": 0.7455012853470437,
      "grad_norm": 0.21946591138839722,
      "kl": 0.021759033203125,
      "learning_rate": 1.469297078922642e-07,
      "loss": 0.0302,
      "reward": 0.07878507301211357,
      "reward_std": 0.5823550596833229,
      "rewards/cosine_scaled_reward": 0.03939253278076649,
      "rewards/format_reward": 0.0,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2508.8056030273438,
      "epoch": 0.7472150814053128,
      "grad_norm": 0.18389303982257843,
      "kl": 0.022979736328125,
      "learning_rate": 1.4554267916537495e-07,
      "loss": 0.0511,
      "reward": 0.11886966414749622,
      "reward_std": 0.6237533167004585,
      "rewards/cosine_scaled_reward": 0.05943482369184494,
      "rewards/format_reward": 0.0,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2480.375030517578,
      "epoch": 0.7489288774635818,
      "grad_norm": 0.18969161808490753,
      "kl": 0.022491455078125,
      "learning_rate": 1.4417536311769885e-07,
      "loss": -0.0202,
      "reward": -0.3488190211355686,
      "reward_std": 0.6528129577636719,
      "rewards/cosine_scaled_reward": -0.17440950870513916,
      "rewards/format_reward": 0.0,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2536.4444580078125,
      "epoch": 0.7506426735218509,
      "grad_norm": 0.2745197117328644,
      "kl": 0.0180511474609375,
      "learning_rate": 1.4282782639029128e-07,
      "loss": -0.0504,
      "reward": 0.2845611646771431,
      "reward_std": 0.4479832947254181,
      "rewards/cosine_scaled_reward": 0.14228056371212006,
      "rewards/format_reward": 0.0,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3101.6528930664062,
      "epoch": 0.7523564695801199,
      "grad_norm": 0.16732795536518097,
      "kl": 0.023773193359375,
      "learning_rate": 1.4150013466019114e-07,
      "loss": -0.0111,
      "reward": -0.2682619922561571,
      "reward_std": 0.6106480062007904,
      "rewards/cosine_scaled_reward": -0.134130991587881,
      "rewards/format_reward": 0.0,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2572.5555725097656,
      "epoch": 0.7540702656383891,
      "grad_norm": 0.19039277732372284,
      "kl": 0.023834228515625,
      "learning_rate": 1.4019235263722034e-07,
      "loss": -0.0026,
      "reward": 0.057762331794947386,
      "reward_std": 0.5597369149327278,
      "rewards/cosine_scaled_reward": 0.02888116310350597,
      "rewards/format_reward": 0.0,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2865.7916259765625,
      "epoch": 0.7557840616966581,
      "grad_norm": 0.19560997188091278,
      "kl": 0.024139404296875,
      "learning_rate": 1.3890454406082956e-07,
      "loss": 0.0056,
      "reward": -0.09995577030349523,
      "reward_std": 0.6689890846610069,
      "rewards/cosine_scaled_reward": -0.049977882008533925,
      "rewards/format_reward": 0.0,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2008.4305419921875,
      "epoch": 0.7574978577549272,
      "grad_norm": 0.2906733751296997,
      "kl": 0.02508544921875,
      "learning_rate": 1.3763677169699217e-07,
      "loss": -0.0189,
      "reward": 0.1475011482834816,
      "reward_std": 0.6993541121482849,
      "rewards/cosine_scaled_reward": 0.07375057972967625,
      "rewards/format_reward": 0.0,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2263.736114501953,
      "epoch": 0.7592116538131962,
      "grad_norm": 0.27822345495224,
      "kl": 0.027435302734375,
      "learning_rate": 1.3638909733514452e-07,
      "loss": -0.0396,
      "reward": 0.08332556113600731,
      "reward_std": 0.692223846912384,
      "rewards/cosine_scaled_reward": 0.04166277777403593,
      "rewards/format_reward": 0.0,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2766.52783203125,
      "epoch": 0.7609254498714653,
      "grad_norm": 0.20186901092529297,
      "kl": 0.02777099609375,
      "learning_rate": 1.351615817851748e-07,
      "loss": 0.0635,
      "reward": 0.03594814520329237,
      "reward_std": 0.6744156032800674,
      "rewards/cosine_scaled_reward": 0.017974070739001036,
      "rewards/format_reward": 0.0,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3045.90283203125,
      "epoch": 0.7626392459297343,
      "grad_norm": 0.24945306777954102,
      "kl": 0.0214080810546875,
      "learning_rate": 1.3395428487445914e-07,
      "loss": 0.0559,
      "reward": -0.03293860936537385,
      "reward_std": 0.6841256394982338,
      "rewards/cosine_scaled_reward": -0.016469309804961085,
      "rewards/format_reward": 0.0,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2726.013885498047,
      "epoch": 0.7643530419880035,
      "grad_norm": 0.267711341381073,
      "kl": 0.02545166015625,
      "learning_rate": 1.3276726544494571e-07,
      "loss": -0.0273,
      "reward": 0.17773457616567612,
      "reward_std": 0.47991518676280975,
      "rewards/cosine_scaled_reward": 0.08886728808283806,
      "rewards/format_reward": 0.0,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2652.3750610351562,
      "epoch": 0.7660668380462725,
      "grad_norm": 0.1955571472644806,
      "kl": 0.024688720703125,
      "learning_rate": 1.316005813502869e-07,
      "loss": -0.0013,
      "reward": -0.21300538629293442,
      "reward_std": 0.5716921538114548,
      "rewards/cosine_scaled_reward": -0.10650269035249949,
      "rewards/format_reward": 0.0,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2869.0972290039062,
      "epoch": 0.7677806341045416,
      "grad_norm": 0.18771569430828094,
      "kl": 0.020751953125,
      "learning_rate": 1.3045428945301953e-07,
      "loss": 0.0449,
      "reward": 0.015052955597639084,
      "reward_std": 0.6415582820773125,
      "rewards/cosine_scaled_reward": 0.007526477798819542,
      "rewards/format_reward": 0.0,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2951.72216796875,
      "epoch": 0.7694944301628106,
      "grad_norm": 0.186003640294075,
      "kl": 0.0264739990234375,
      "learning_rate": 1.2932844562179352e-07,
      "loss": -0.0117,
      "reward": -0.1732272356748581,
      "reward_std": 0.6033661440014839,
      "rewards/cosine_scaled_reward": -0.0866136197000742,
      "rewards/format_reward": 0.0,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2393.916717529297,
      "epoch": 0.7712082262210797,
      "grad_norm": 0.18100591003894806,
      "kl": 0.018280029296875,
      "learning_rate": 1.2822310472864885e-07,
      "loss": 0.0174,
      "reward": -0.17222392931580544,
      "reward_std": 0.4759965166449547,
      "rewards/cosine_scaled_reward": -0.08611196093261242,
      "rewards/format_reward": 0.0,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3247.5555419921875,
      "epoch": 0.7729220222793488,
      "grad_norm": 0.16927795112133026,
      "kl": 0.026092529296875,
      "learning_rate": 1.2713832064634125e-07,
      "loss": 0.0101,
      "reward": -0.1602705717086792,
      "reward_std": 0.5965098738670349,
      "rewards/cosine_scaled_reward": -0.0801352858543396,
      "rewards/format_reward": 0.0,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2699.2222900390625,
      "epoch": 0.7746358183376179,
      "grad_norm": 0.16316962242126465,
      "kl": 0.0219879150390625,
      "learning_rate": 1.260741462457165e-07,
      "loss": 0.055,
      "reward": -0.06079525873064995,
      "reward_std": 0.6986799910664558,
      "rewards/cosine_scaled_reward": -0.030397622846066952,
      "rewards/format_reward": 0.0,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2595.3472290039062,
      "epoch": 0.7763496143958869,
      "grad_norm": 0.5946676135063171,
      "kl": 0.029022216796875,
      "learning_rate": 1.2503063339313356e-07,
      "loss": -0.0538,
      "reward": -0.26486414577811956,
      "reward_std": 0.415864534676075,
      "rewards/cosine_scaled_reward": -0.13243207102641463,
      "rewards/format_reward": 0.0,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2008.9305725097656,
      "epoch": 0.778063410454156,
      "grad_norm": 0.19199617207050323,
      "kl": 0.0153961181640625,
      "learning_rate": 1.2400783294793668e-07,
      "loss": 0.0063,
      "reward": 0.13748213648796082,
      "reward_std": 0.6150016859173775,
      "rewards/cosine_scaled_reward": 0.06874106079339981,
      "rewards/format_reward": 0.0,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2725.0000610351562,
      "epoch": 0.779777206512425,
      "grad_norm": 0.1928061991930008,
      "kl": 0.02520751953125,
      "learning_rate": 1.2300579475997657e-07,
      "loss": 0.0235,
      "reward": -0.0030081644654273987,
      "reward_std": 0.7155122309923172,
      "rewards/cosine_scaled_reward": -0.0015040775761008263,
      "rewards/format_reward": 0.0,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2560.888916015625,
      "epoch": 0.781491002570694,
      "grad_norm": 0.2278081625699997,
      "kl": 0.0233154296875,
      "learning_rate": 1.220245676671809e-07,
      "loss": 0.0422,
      "reward": -0.040069979906547815,
      "reward_std": 0.6579814180731773,
      "rewards/cosine_scaled_reward": -0.02003499452257529,
      "rewards/format_reward": 0.0,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2620.638916015625,
      "epoch": 0.7832047986289632,
      "grad_norm": 0.2580767571926117,
      "kl": 0.0252532958984375,
      "learning_rate": 1.2106419949317388e-07,
      "loss": 0.033,
      "reward": 0.23012623190879822,
      "reward_std": 0.6976396143436432,
      "rewards/cosine_scaled_reward": 0.11506311595439911,
      "rewards/format_reward": 0.0,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2410.513946533203,
      "epoch": 0.7849185946872322,
      "grad_norm": 0.2296840250492096,
      "kl": 0.0242919921875,
      "learning_rate": 1.2012473704494537e-07,
      "loss": -0.0221,
      "reward": -0.00019283778965473175,
      "reward_std": 0.6016373038291931,
      "rewards/cosine_scaled_reward": -9.64207574725151e-05,
      "rewards/format_reward": 0.0,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2265.4583435058594,
      "epoch": 0.7866323907455013,
      "grad_norm": 0.23107987642288208,
      "kl": 0.0201568603515625,
      "learning_rate": 1.1920622611056974e-07,
      "loss": -0.0309,
      "reward": 0.18072006362490356,
      "reward_std": 0.5644106566905975,
      "rewards/cosine_scaled_reward": 0.09036003064829856,
      "rewards/format_reward": 0.0,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2700.541748046875,
      "epoch": 0.7883461868037703,
      "grad_norm": 0.21530865132808685,
      "kl": 0.024566650390625,
      "learning_rate": 1.1830871145697412e-07,
      "loss": 0.0267,
      "reward": 0.030735374661162496,
      "reward_std": 0.7207788527011871,
      "rewards/cosine_scaled_reward": 0.01536769128870219,
      "rewards/format_reward": 0.0,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2300.8333435058594,
      "epoch": 0.7900599828620394,
      "grad_norm": 0.31069549918174744,
      "kl": 0.023162841796875,
      "learning_rate": 1.1743223682775649e-07,
      "loss": -0.0563,
      "reward": 0.09233328700065613,
      "reward_std": 0.7090381979942322,
      "rewards/cosine_scaled_reward": 0.046166639775037766,
      "rewards/format_reward": 0.0,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2608.8056030273438,
      "epoch": 0.7917737789203085,
      "grad_norm": 0.20271840691566467,
      "kl": 0.0250244140625,
      "learning_rate": 1.1657684494105386e-07,
      "loss": 0.0362,
      "reward": -0.07103721424937248,
      "reward_std": 0.7956888303160667,
      "rewards/cosine_scaled_reward": -0.03551860898733139,
      "rewards/format_reward": 0.0,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2780.4444580078125,
      "epoch": 0.7934875749785776,
      "grad_norm": 0.2081584334373474,
      "kl": 0.0209808349609375,
      "learning_rate": 1.1574257748745986e-07,
      "loss": -0.0471,
      "reward": 0.021411696448922157,
      "reward_std": 0.48744403570890427,
      "rewards/cosine_scaled_reward": 0.010705851949751377,
      "rewards/format_reward": 0.0,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2673.4861450195312,
      "epoch": 0.7952013710368466,
      "grad_norm": 0.19560140371322632,
      "kl": 0.0238037109375,
      "learning_rate": 1.1492947512799328e-07,
      "loss": -0.0409,
      "reward": 0.18470758572220802,
      "reward_std": 0.5649774596095085,
      "rewards/cosine_scaled_reward": 0.09235379751771688,
      "rewards/format_reward": 0.0,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3105.27783203125,
      "epoch": 0.7969151670951157,
      "grad_norm": 0.20849952101707458,
      "kl": 0.030609130859375,
      "learning_rate": 1.1413757749211602e-07,
      "loss": 0.041,
      "reward": -0.284846730530262,
      "reward_std": 0.5418054684996605,
      "rewards/cosine_scaled_reward": -0.14242336247116327,
      "rewards/format_reward": 0.0,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2965.8611450195312,
      "epoch": 0.7986289631533847,
      "grad_norm": 0.17242176830768585,
      "kl": 0.025604248046875,
      "learning_rate": 1.1336692317580158e-07,
      "loss": 0.0034,
      "reward": -0.3226154297590256,
      "reward_std": 0.5333989933133125,
      "rewards/cosine_scaled_reward": -0.16130771208554506,
      "rewards/format_reward": 0.0,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3079.6806030273438,
      "epoch": 0.8003427592116538,
      "grad_norm": 0.20415925979614258,
      "kl": 0.0255126953125,
      "learning_rate": 1.1261754973965422e-07,
      "loss": 0.0969,
      "reward": -0.24770671501755714,
      "reward_std": 0.5701889246702194,
      "rewards/cosine_scaled_reward": -0.12385335750877857,
      "rewards/format_reward": 0.0,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2234.986083984375,
      "epoch": 0.8020565552699229,
      "grad_norm": 0.3467549979686737,
      "kl": 0.024200439453125,
      "learning_rate": 1.1188949370707787e-07,
      "loss": 0.0855,
      "reward": -0.009742069989442825,
      "reward_std": 0.591868631541729,
      "rewards/cosine_scaled_reward": -0.004871031269431114,
      "rewards/format_reward": 0.0,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2717.0833435058594,
      "epoch": 0.803770351328192,
      "grad_norm": 0.20191779732704163,
      "kl": 0.0237579345703125,
      "learning_rate": 1.1118279056249653e-07,
      "loss": 0.0622,
      "reward": 0.058277749456465244,
      "reward_std": 0.7684449702501297,
      "rewards/cosine_scaled_reward": 0.029138876125216484,
      "rewards/format_reward": 0.0,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2607.1388549804688,
      "epoch": 0.805484147386461,
      "grad_norm": 0.2015339732170105,
      "kl": 0.0255126953125,
      "learning_rate": 1.1049747474962444e-07,
      "loss": -0.0145,
      "reward": -0.2184823751449585,
      "reward_std": 0.5644990280270576,
      "rewards/cosine_scaled_reward": -0.10924118757247925,
      "rewards/format_reward": 0.0,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3260.3333740234375,
      "epoch": 0.8071979434447301,
      "grad_norm": 0.2259937822818756,
      "kl": 0.02581787109375,
      "learning_rate": 1.0983357966978745e-07,
      "loss": 0.0565,
      "reward": -0.22493689320981503,
      "reward_std": 0.5675350055098534,
      "rewards/cosine_scaled_reward": -0.11246845219284296,
      "rewards/format_reward": 0.0,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2854.1806030273438,
      "epoch": 0.8089117395029991,
      "grad_norm": 0.21495820581912994,
      "kl": 0.028594970703125,
      "learning_rate": 1.0919113768029517e-07,
      "loss": -0.0313,
      "reward": 0.14072632044553757,
      "reward_std": 0.6395101621747017,
      "rewards/cosine_scaled_reward": 0.07036316394805908,
      "rewards/format_reward": 0.0,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2890.4444580078125,
      "epoch": 0.8106255355612683,
      "grad_norm": 0.18685470521450043,
      "kl": 0.023468017578125,
      "learning_rate": 1.0857018009286381e-07,
      "loss": -0.0026,
      "reward": -0.1240294948220253,
      "reward_std": 0.48969001322984695,
      "rewards/cosine_scaled_reward": -0.06201474368572235,
      "rewards/format_reward": 0.0,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2470.0972900390625,
      "epoch": 0.8123393316195373,
      "grad_norm": 0.2818465232849121,
      "kl": 0.029144287109375,
      "learning_rate": 1.0797073717209013e-07,
      "loss": 0.0035,
      "reward": 0.24050107831135392,
      "reward_std": 0.5852163806557655,
      "rewards/cosine_scaled_reward": 0.12025054381228983,
      "rewards/format_reward": 0.0,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2933.541748046875,
      "epoch": 0.8140531276778064,
      "grad_norm": 0.19675259292125702,
      "kl": 0.027496337890625,
      "learning_rate": 1.0739283813397639e-07,
      "loss": 0.0321,
      "reward": -0.22623535431921482,
      "reward_std": 0.6677599251270294,
      "rewards/cosine_scaled_reward": -0.1131176782073453,
      "rewards/format_reward": 0.0,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2794.9862060546875,
      "epoch": 0.8157669237360754,
      "grad_norm": 0.24406589567661285,
      "kl": 0.02947998046875,
      "learning_rate": 1.068365111445064e-07,
      "loss": -0.0179,
      "reward": -0.3521595522761345,
      "reward_std": 0.5985631048679352,
      "rewards/cosine_scaled_reward": -0.17607977613806725,
      "rewards/format_reward": 0.0,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2567.4722290039062,
      "epoch": 0.8174807197943444,
      "grad_norm": 0.17105937004089355,
      "kl": 0.016357421875,
      "learning_rate": 1.063017833182728e-07,
      "loss": 0.0385,
      "reward": -0.2558911629021168,
      "reward_std": 0.4246537983417511,
      "rewards/cosine_scaled_reward": -0.12794558703899384,
      "rewards/format_reward": 0.0,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2355.6806030273438,
      "epoch": 0.8191945158526135,
      "grad_norm": 0.24741511046886444,
      "kl": 0.02423095703125,
      "learning_rate": 1.0578868071715544e-07,
      "loss": 0.0854,
      "reward": -0.1565770129673183,
      "reward_std": 0.6820876449346542,
      "rewards/cosine_scaled_reward": -0.07828850811347365,
      "rewards/format_reward": 0.0,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2624.0833435058594,
      "epoch": 0.8209083119108826,
      "grad_norm": 0.20646637678146362,
      "kl": 0.0313720703125,
      "learning_rate": 1.0529722834905125e-07,
      "loss": 0.0146,
      "reward": 0.1560894399881363,
      "reward_std": 0.5770560130476952,
      "rewards/cosine_scaled_reward": 0.0780447069555521,
      "rewards/format_reward": 0.0,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2327.7222290039062,
      "epoch": 0.8226221079691517,
      "grad_norm": 0.23056431114673615,
      "kl": 0.01910400390625,
      "learning_rate": 1.0482745016665526e-07,
      "loss": 0.0635,
      "reward": 0.149917745962739,
      "reward_std": 0.7103602811694145,
      "rewards/cosine_scaled_reward": 0.07495887111872435,
      "rewards/format_reward": 0.0,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2461.5972900390625,
      "epoch": 0.8243359040274207,
      "grad_norm": 0.24706712365150452,
      "kl": 0.03350830078125,
      "learning_rate": 1.0437936906629334e-07,
      "loss": 0.0029,
      "reward": -0.1800133902579546,
      "reward_std": 0.49367547780275345,
      "rewards/cosine_scaled_reward": -0.09000669163651764,
      "rewards/format_reward": 0.0,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2864.0694580078125,
      "epoch": 0.8260497000856898,
      "grad_norm": 0.25076547265052795,
      "kl": 0.021514892578125,
      "learning_rate": 1.0395300688680625e-07,
      "loss": 0.0445,
      "reward": -0.14674655348062515,
      "reward_std": 0.5242787301540375,
      "rewards/cosine_scaled_reward": -0.07337328046560287,
      "rewards/format_reward": 0.0,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2596.874969482422,
      "epoch": 0.8277634961439588,
      "grad_norm": 0.2056618481874466,
      "kl": 0.02398681640625,
      "learning_rate": 1.0354838440848501e-07,
      "loss": 0.0449,
      "reward": -0.12835523579269648,
      "reward_std": 0.5452019795775414,
      "rewards/cosine_scaled_reward": -0.06417762162163854,
      "rewards/format_reward": 0.0,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2179.500030517578,
      "epoch": 0.829477292202228,
      "grad_norm": 0.17514649033546448,
      "kl": 0.0175018310546875,
      "learning_rate": 1.0316552135205837e-07,
      "loss": 0.058,
      "reward": -0.154528075363487,
      "reward_std": 0.5336438938975334,
      "rewards/cosine_scaled_reward": -0.07726403628475964,
      "rewards/format_reward": 0.0,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2221.6666259765625,
      "epoch": 0.831191088260497,
      "grad_norm": 0.2613643705844879,
      "kl": 0.028350830078125,
      "learning_rate": 1.0280443637773163e-07,
      "loss": 0.0496,
      "reward": -0.2054775208234787,
      "reward_std": 0.5721682235598564,
      "rewards/cosine_scaled_reward": -0.1027387659996748,
      "rewards/format_reward": 0.0,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2448.3193969726562,
      "epoch": 0.8329048843187661,
      "grad_norm": 0.2419876903295517,
      "kl": 0.028411865234375,
      "learning_rate": 1.0246514708427701e-07,
      "loss": -0.0397,
      "reward": -0.06807173043489456,
      "reward_std": 0.507116761058569,
      "rewards/cosine_scaled_reward": -0.03403585962951183,
      "rewards/format_reward": 0.0,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2735.7084045410156,
      "epoch": 0.8346186803770351,
      "grad_norm": 0.2406454235315323,
      "kl": 0.0244140625,
      "learning_rate": 1.0214767000817596e-07,
      "loss": 0.0244,
      "reward": -0.2896123267710209,
      "reward_std": 0.5326507315039635,
      "rewards/cosine_scaled_reward": -0.1448061689734459,
      "rewards/format_reward": 0.0,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2310.27783203125,
      "epoch": 0.8363324764353042,
      "grad_norm": 0.252458781003952,
      "kl": 0.021209716796875,
      "learning_rate": 1.0185202062281336e-07,
      "loss": 0.0239,
      "reward": 0.10779337584972382,
      "reward_std": 0.6982715576887131,
      "rewards/cosine_scaled_reward": 0.053896697354502976,
      "rewards/format_reward": 0.0,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2743.541748046875,
      "epoch": 0.8380462724935732,
      "grad_norm": 0.17434372007846832,
      "kl": 0.024749755859375,
      "learning_rate": 1.0157821333772304e-07,
      "loss": 0.0124,
      "reward": -0.16761679388582706,
      "reward_std": 0.6089917570352554,
      "rewards/cosine_scaled_reward": -0.08380839880555868,
      "rewards/format_reward": 0.0,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2486.236114501953,
      "epoch": 0.8397600685518424,
      "grad_norm": 0.2090701311826706,
      "kl": 0.027984619140625,
      "learning_rate": 1.013262614978859e-07,
      "loss": 0.0036,
      "reward": -0.05014536017552018,
      "reward_std": 0.5763295590877533,
      "rewards/cosine_scaled_reward": -0.025072677060961723,
      "rewards/format_reward": 0.0,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2848.888916015625,
      "epoch": 0.8414738646101114,
      "grad_norm": 0.22741778194904327,
      "kl": 0.029052734375,
      "learning_rate": 1.0109617738307911e-07,
      "loss": 0.0629,
      "reward": -0.16439465060830116,
      "reward_std": 0.6782207787036896,
      "rewards/cosine_scaled_reward": -0.08219731226563454,
      "rewards/format_reward": 0.0,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3189.6250610351562,
      "epoch": 0.8431876606683805,
      "grad_norm": 0.17802605032920837,
      "kl": 0.028839111328125,
      "learning_rate": 1.0088797220727779e-07,
      "loss": 0.0435,
      "reward": -0.35712628811597824,
      "reward_std": 0.6088190823793411,
      "rewards/cosine_scaled_reward": -0.17856314033269882,
      "rewards/format_reward": 0.0,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2387.888885498047,
      "epoch": 0.8449014567266495,
      "grad_norm": 0.2299438714981079,
      "kl": 0.025177001953125,
      "learning_rate": 1.0070165611810855e-07,
      "loss": 0.0576,
      "reward": -0.17006561160087585,
      "reward_std": 0.3991905003786087,
      "rewards/cosine_scaled_reward": -0.08503280207514763,
      "rewards/format_reward": 0.0,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2941.8611450195312,
      "epoch": 0.8466152527849186,
      "grad_norm": 0.18643692135810852,
      "kl": 0.0261383056640625,
      "learning_rate": 1.005372381963547e-07,
      "loss": 0.0474,
      "reward": -0.04825907852500677,
      "reward_std": 0.6627323552966118,
      "rewards/cosine_scaled_reward": -0.02412955043837428,
      "rewards/format_reward": 0.0,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2563.375030517578,
      "epoch": 0.8483290488431876,
      "grad_norm": 0.32022979855537415,
      "kl": 0.03094482421875,
      "learning_rate": 1.0039472645551372e-07,
      "loss": -0.0218,
      "reward": 0.03584544826298952,
      "reward_std": 0.5564405769109726,
      "rewards/cosine_scaled_reward": 0.017922731814906,
      "rewards/format_reward": 0.0,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2572.77783203125,
      "epoch": 0.8500428449014568,
      "grad_norm": 0.23663800954818726,
      "kl": 0.0213623046875,
      "learning_rate": 1.002741278414069e-07,
      "loss": 0.0756,
      "reward": 0.13385188579559326,
      "reward_std": 0.6874089986085892,
      "rewards/cosine_scaled_reward": 0.06692593172192574,
      "rewards/format_reward": 0.0,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2480.5694274902344,
      "epoch": 0.8517566409597258,
      "grad_norm": 0.2888961732387543,
      "kl": 0.0250244140625,
      "learning_rate": 1.0017544823184055e-07,
      "loss": 0.1087,
      "reward": 0.2151249535381794,
      "reward_std": 0.7869587689638138,
      "rewards/cosine_scaled_reward": 0.10756248049438,
      "rewards/format_reward": 0.0,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3000.1666259765625,
      "epoch": 0.8534704370179949,
      "grad_norm": 0.16567862033843994,
      "kl": 0.0272216796875,
      "learning_rate": 1.0009869243631952e-07,
      "loss": -0.0254,
      "reward": -0.33249833807349205,
      "reward_std": 0.48314109444618225,
      "rewards/cosine_scaled_reward": -0.16624917834997177,
      "rewards/format_reward": 0.0,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2385.5416870117188,
      "epoch": 0.8551842330762639,
      "grad_norm": 0.20905697345733643,
      "kl": 0.027862548828125,
      "learning_rate": 1.000438641958131e-07,
      "loss": 0.017,
      "reward": 0.046394890174269676,
      "reward_std": 0.649334043264389,
      "rewards/cosine_scaled_reward": 0.02319744322448969,
      "rewards/format_reward": 0.0,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1828.2777862548828,
      "epoch": 0.856898029134533,
      "grad_norm": 0.25425171852111816,
      "kl": 0.0164031982421875,
      "learning_rate": 1.0001096618257236e-07,
      "loss": -0.0015,
      "reward": 0.1327105201780796,
      "reward_std": 0.5622994378209114,
      "rewards/cosine_scaled_reward": 0.06635526567697525,
      "rewards/format_reward": 0.0,
      "step": 500
    },
    {
      "epoch": 0.856898029134533,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.01978990149567835,
      "train_runtime": 91059.0796,
      "train_samples_per_second": 0.395,
      "train_steps_per_second": 0.005
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}