{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1620,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 867.0,
      "completions/max_terminated_length": 867.0,
      "completions/mean_length": 547.625,
      "completions/mean_terminated_length": 547.625,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 0.0030864197530864196,
      "grad_norm": 4.879990798151839,
      "kl": NaN,
      "learning_rate": 0.0,
      "loss": -0.1871,
      "num_tokens": 24340.0,
      "reward": 0.003167829941958189,
      "reward_std": 0.006335659883916378,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 4.7588691813871264e-05,
      "rewards/logprob_reward/std": 0.0002692022826522589,
      "step": 1
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 784.0,
      "completions/mean_length": 523.3125,
      "completions/mean_terminated_length": 507.1612854003906,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.006172839506172839,
      "grad_norm": 6.645700899096121,
      "kl": NaN,
      "learning_rate": 1.020408163265306e-08,
      "loss": -0.3743,
      "num_tokens": 47462.0,
      "reward": 0.0063501279801130295,
      "reward_std": 0.012700255960226059,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00011125343735329807,
      "rewards/logprob_reward/std": 0.0004910404095426202,
      "step": 2
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 551.0,
      "completions/mean_terminated_length": 535.741943359375,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.009259259259259259,
      "grad_norm": 0.00038052978130953183,
      "kl": NaN,
      "learning_rate": 2.040816326530612e-08,
      "loss": 0.0,
      "num_tokens": 71866.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 3
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 532.75,
      "completions/mean_terminated_length": 500.0000305175781,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.012345679012345678,
      "grad_norm": 0.0005572360154994931,
      "kl": NaN,
      "learning_rate": 3.0612244897959183e-08,
      "loss": 0.0,
      "num_tokens": 95878.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 4
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 777.0,
      "completions/max_terminated_length": 777.0,
      "completions/mean_length": 536.03125,
      "completions/mean_terminated_length": 536.03125,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 0.015432098765432098,
      "grad_norm": 10.61183558134393,
      "kl": NaN,
      "learning_rate": 4.081632653061224e-08,
      "loss": -0.3743,
      "num_tokens": 119719.0,
      "reward": 0.0064407894387841225,
      "reward_std": 0.012881578877568245,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00021198844478931278,
      "rewards/logprob_reward/std": 0.0008489831234328449,
      "step": 5
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 879.0,
      "completions/max_terminated_length": 879.0,
      "completions/mean_length": 544.4375,
      "completions/mean_terminated_length": 544.4375,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.018518518518518517,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 5.1020408163265303e-08,
      "loss": 0.0,
      "num_tokens": 143649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 6
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 794.0,
      "completions/max_terminated_length": 794.0,
      "completions/mean_length": 502.5,
      "completions/mean_terminated_length": 502.5,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.021604938271604937,
      "grad_norm": 7.27543371690155,
      "kl": NaN,
      "learning_rate": 6.122448979591837e-08,
      "loss": -0.3743,
      "num_tokens": 166089.0,
      "reward": 0.006321371532976627,
      "reward_std": 0.012642743065953255,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 7.93015060480684e-05,
      "rewards/logprob_reward/std": 0.0003123948990833014,
      "step": 7
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 769.0,
      "completions/mean_length": 524.78125,
      "completions/mean_terminated_length": 508.6773986816406,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 0.024691358024691357,
      "grad_norm": 9.211364593756338,
      "kl": NaN,
      "learning_rate": 7.142857142857142e-08,
      "loss": -0.5506,
      "num_tokens": 189406.0,
      "reward": 0.006642765365540981,
      "reward_std": 0.013285531662404537,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00043640637886710465,
      "rewards/logprob_reward/std": 0.0014902930706739426,
      "step": 8
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 700.0,
      "completions/max_terminated_length": 700.0,
      "completions/mean_length": 463.84375,
      "completions/mean_terminated_length": 463.84375,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.027777777777777776,
      "grad_norm": 4.750598646530187,
      "kl": NaN,
      "learning_rate": 8.163265306122448e-08,
      "loss": -0.1871,
      "num_tokens": 210493.0,
      "reward": 0.0031913958955556154,
      "reward_std": 0.0063827913254499435,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 7.377310248557478e-05,
      "rewards/logprob_reward/std": 0.00041732366662472486,
      "step": 9
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 970.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 551.4375,
      "completions/mean_terminated_length": 551.4375,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.030864197530864196,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 9.183673469387755e-08,
      "loss": 0.0,
      "num_tokens": 234415.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 10
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 874.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 512.84375,
      "completions/mean_terminated_length": 512.84375,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.033950617283950615,
      "grad_norm": 5.585638064770051,
      "kl": NaN,
      "learning_rate": 1.0204081632653061e-07,
      "loss": -0.1871,
      "num_tokens": 257178.0,
      "reward": 0.003199605271220207,
      "reward_std": 0.006399210542440414,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 8.289470861200243e-05,
      "rewards/logprob_reward/std": 0.0004689233028329909,
      "step": 11
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 686.0,
      "completions/mean_length": 503.375,
      "completions/mean_terminated_length": 468.66668701171875,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 0.037037037037037035,
      "grad_norm": 0.0008072613180261669,
      "kl": NaN,
      "learning_rate": 1.1224489795918366e-07,
      "loss": 0.0,
      "num_tokens": 279898.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 12
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 570.625,
      "completions/mean_terminated_length": 556.0,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.040123456790123455,
      "grad_norm": 0.00046137066571460666,
      "kl": NaN,
      "learning_rate": 1.2244897959183673e-07,
      "loss": 0.0,
      "num_tokens": 304566.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 13
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 598.75,
      "completions/mean_terminated_length": 585.0322265625,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 0.043209876543209874,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 1.326530612244898e-07,
      "loss": 0.0,
      "num_tokens": 330450.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 14
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 565.15625,
      "completions/mean_terminated_length": 534.5667114257812,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.046296296296296294,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 1.4285714285714285e-07,
      "loss": 0.0,
      "num_tokens": 355123.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 15
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 542.25,
      "completions/mean_terminated_length": 526.7096557617188,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 0.04938271604938271,
      "grad_norm": 4.36675829873075,
      "kl": NaN,
      "learning_rate": 1.5306122448979592e-07,
      "loss": -0.275,
      "num_tokens": 378955.0,
      "reward": 0.003193480661138892,
      "reward_std": 0.006386961322277784,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 7.608939631609246e-05,
      "rewards/logprob_reward/std": 0.0003962118935305625,
      "step": 16
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 553.59375,
      "completions/mean_terminated_length": 522.2333374023438,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.05246913580246913,
      "grad_norm": 4.919935265122631,
      "kl": NaN,
      "learning_rate": 1.6326530612244896e-07,
      "loss": -0.1871,
      "num_tokens": 403382.0,
      "reward": 0.0031280089169740677,
      "reward_std": 0.006256017833948135,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 3.3431635984015884e-06,
      "rewards/logprob_reward/std": 1.8911789084086195e-05,
      "step": 17
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 594.3125,
      "completions/mean_terminated_length": 549.862060546875,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.05555555555555555,
      "grad_norm": 7.433026661740928,
      "kl": NaN,
      "learning_rate": 1.7346938775510203e-07,
      "loss": -0.1871,
      "num_tokens": 428960.0,
      "reward": 0.0031758369877934456,
      "reward_std": 0.006351673975586891,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 5.648556907544844e-05,
      "rewards/logprob_reward/std": 0.0003195306344423443,
      "step": 18
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 756.0,
      "completions/mean_length": 523.5,
      "completions/mean_terminated_length": 490.13336181640625,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 0.05864197530864197,
      "grad_norm": 4.74937187483194,
      "kl": NaN,
      "learning_rate": 1.836734693877551e-07,
      "loss": -0.1871,
      "num_tokens": 452024.0,
      "reward": 0.003166038077324629,
      "reward_std": 0.006332076154649258,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 4.559782973956317e-05,
      "rewards/logprob_reward/std": 0.00025794029352255166,
      "step": 19
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 856.0,
      "completions/mean_length": 528.875,
      "completions/mean_terminated_length": 495.86669921875,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.06172839506172839,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 1.9387755102040814e-07,
      "loss": 0.0,
      "num_tokens": 475340.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 20
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 553.65625,
      "completions/mean_terminated_length": 522.300048828125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.06481481481481481,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 2.0408163265306121e-07,
      "loss": 0.0,
      "num_tokens": 499329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 21
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 716.0,
      "completions/mean_length": 511.375,
      "completions/mean_terminated_length": 494.83868408203125,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.06790123456790123,
      "grad_norm": 5.039991769823507,
      "kl": NaN,
      "learning_rate": 2.1428571428571426e-07,
      "loss": -0.1871,
      "num_tokens": 521813.0,
      "reward": 0.003222013358026743,
      "reward_std": 0.006444026716053486,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00010779264266602695,
      "rewards/logprob_reward/std": 0.0006097672739997506,
      "step": 22
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 935.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 515.0,
      "completions/mean_terminated_length": 515.0,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.07098765432098765,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 2.2448979591836733e-07,
      "loss": 0.0,
      "num_tokens": 545069.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 23
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 494.125,
      "completions/mean_terminated_length": 477.0322570800781,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.07407407407407407,
      "grad_norm": 10.389645085374694,
      "kl": NaN,
      "learning_rate": 2.346938775510204e-07,
      "loss": -0.3743,
      "num_tokens": 567493.0,
      "reward": 0.006342649459838867,
      "reward_std": 0.012685298919677734,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00010294403182342649,
      "rewards/logprob_reward/std": 0.0004805905919056386,
      "step": 24
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 584.75,
      "completions/mean_terminated_length": 483.3846435546875,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 0.07716049382716049,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 2.4489795918367347e-07,
      "loss": 0.0,
      "num_tokens": 593321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 25
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 537.15625,
      "completions/mean_terminated_length": 521.4515991210938,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.08024691358024691,
      "grad_norm": 2.673643403308418,
      "kl": NaN,
      "learning_rate": 2.551020408163265e-07,
      "loss": -0.0624,
      "num_tokens": 617334.0,
      "reward": 0.00964600034058094,
      "reward_std": 0.006430668756365776,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00030111195519566536,
      "rewards/logprob_reward/std": 0.0009512502583675086,
      "step": 26
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 496.03125,
      "completions/mean_terminated_length": 479.0,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.08333333333333333,
      "grad_norm": 5.735671573618509,
      "kl": NaN,
      "learning_rate": 2.653061224489796e-07,
      "loss": -0.1871,
      "num_tokens": 639619.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 27
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 537.75,
      "completions/mean_terminated_length": 522.0645141601562,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.08641975308641975,
      "grad_norm": 9.310152593066146,
      "kl": NaN,
      "learning_rate": 2.755102040816326e-07,
      "loss": -0.3743,
      "num_tokens": 663439.0,
      "reward": 0.006380573846399784,
      "reward_std": 0.012761147692799568,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00014508160529658198,
      "rewards/logprob_reward/std": 0.0007550088339485228,
      "step": 28
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 494.53125,
      "completions/mean_terminated_length": 477.45159912109375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "epoch": 0.08950617283950617,
      "grad_norm": 4.434418341643034,
      "kl": NaN,
      "learning_rate": 2.857142857142857e-07,
      "loss": -0.1871,
      "num_tokens": 685388.0,
      "reward": 0.003202674211934209,
      "reward_std": 0.006405348423868418,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 8.63045861478895e-05,
      "rewards/logprob_reward/std": 0.00048821247764863074,
      "step": 29
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 750.0,
      "completions/mean_length": 608.9375,
      "completions/mean_terminated_length": 595.54833984375,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.09259259259259259,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 2.9591836734693874e-07,
      "loss": 0.0,
      "num_tokens": 711710.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 30
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1020.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 531.59375,
      "completions/mean_terminated_length": 531.59375,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.09567901234567901,
      "grad_norm": 4.5768042220847605,
      "kl": NaN,
      "learning_rate": 3.0612244897959183e-07,
      "loss": -0.1871,
      "num_tokens": 735193.0,
      "reward": 0.0031982893124222755,
      "reward_std": 0.006396578624844551,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 8.14325176179409e-05,
      "rewards/logprob_reward/std": 0.00046065187780186534,
      "step": 31
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 882.0,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 492.46875,
      "completions/mean_terminated_length": 492.46875,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.09876543209876543,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 3.163265306122449e-07,
      "loss": 0.0,
      "num_tokens": 757452.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 32
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 819.0,
      "completions/mean_length": 552.40625,
      "completions/mean_terminated_length": 520.9666748046875,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.10185185185185185,
      "grad_norm": 5.73460587413631,
      "kl": NaN,
      "learning_rate": 3.265306122448979e-07,
      "loss": -0.1871,
      "num_tokens": 781945.0,
      "reward": 0.0032463932875543833,
      "reward_std": 0.0064927865751087666,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00013488148397300392,
      "rewards/logprob_reward/std": 0.0007630048785358667,
      "step": 33
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 501.4375,
      "completions/mean_terminated_length": 484.58062744140625,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.10493827160493827,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 3.3673469387755096e-07,
      "loss": 0.0,
      "num_tokens": 804735.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 34
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 908.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 514.40625,
      "completions/mean_terminated_length": 514.40625,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.10802469135802469,
      "grad_norm": 6.87957308311849,
      "kl": NaN,
      "learning_rate": 3.4693877551020406e-07,
      "loss": -0.3743,
      "num_tokens": 827436.0,
      "reward": 0.006365813780575991,
      "reward_std": 0.012731627561151981,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00012868200428783894,
      "rewards/logprob_reward/std": 0.0005134922685101628,
      "step": 35
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 638.78125,
      "completions/mean_terminated_length": 530.9199829101562,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.1111111111111111,
      "grad_norm": 5.874910559307383,
      "kl": NaN,
      "learning_rate": 3.5714285714285716e-07,
      "loss": -0.347,
      "num_tokens": 854661.0,
      "reward": 0.0031954434234648943,
      "reward_std": 0.006390886846929789,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 7.827053195796907e-05,
      "rewards/logprob_reward/std": 0.0003081193717662245,
      "step": 36
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 532.90625,
      "completions/mean_terminated_length": 500.16668701171875,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.11419753086419752,
      "grad_norm": 7.681843992448197,
      "kl": NaN,
      "learning_rate": 3.673469387755102e-07,
      "loss": -0.3743,
      "num_tokens": 878670.0,
      "reward": 0.006405107211321592,
      "reward_std": 0.012810214422643185,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0001723413442960009,
      "rewards/logprob_reward/std": 0.0007948004058562219,
      "step": 37
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 519.6875,
      "completions/mean_terminated_length": 503.4193420410156,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 0.11728395061728394,
      "grad_norm": 1.9184316592124007,
      "kl": NaN,
      "learning_rate": 3.7755102040816324e-07,
      "loss": -0.0545,
      "num_tokens": 901708.0,
      "reward": 0.0031901041511446238,
      "reward_std": 0.006277103908360004,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 7.233808719320223e-05,
      "rewards/logprob_reward/std": 0.0002902286360040307,
      "step": 38
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 615.34375,
      "completions/mean_terminated_length": 573.0689697265625,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "epoch": 0.12037037037037036,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 3.877551020408163e-07,
      "loss": 0.0,
      "num_tokens": 928267.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 39
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 578.4375,
      "completions/mean_terminated_length": 495.9259338378906,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.12345679012345678,
      "grad_norm": 0.00354366427333239,
      "kl": NaN,
      "learning_rate": 3.979591836734694e-07,
      "loss": 0.0,
      "num_tokens": 953173.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 40
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 525.5625,
      "completions/mean_terminated_length": 474.0,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.12654320987654322,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 4.0816326530612243e-07,
      "loss": 0.0,
      "num_tokens": 976395.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 41
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 561.1875,
      "completions/mean_terminated_length": 530.3333740234375,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.12962962962962962,
      "grad_norm": 5.452052063608532,
      "kl": NaN,
      "learning_rate": 4.183673469387755e-07,
      "loss": -0.3547,
      "num_tokens": 1001085.0,
      "reward": 0.0031829094514250755,
      "reward_std": 0.006365818902850151,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 6.434385431930423e-05,
      "rewards/logprob_reward/std": 0.0003311107575427741,
      "step": 42
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 634.0,
      "completions/mean_length": 524.4375,
      "completions/mean_terminated_length": 453.0714416503906,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 0.13271604938271606,
      "grad_norm": 8.882324796707932,
      "kl": NaN,
      "learning_rate": 4.285714285714285e-07,
      "loss": -0.3549,
      "num_tokens": 1024631.0,
      "reward": 0.003180807689204812,
      "reward_std": 0.006361615378409624,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 6.200830830493942e-05,
      "rewards/logprob_reward/std": 0.0003339833638165146,
      "step": 43
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 738.0,
      "completions/max_terminated_length": 738.0,
      "completions/mean_length": 509.21875,
      "completions/mean_terminated_length": 509.21875,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.13580246913580246,
      "grad_norm": 6.3561378563385595,
      "kl": NaN,
      "learning_rate": 4.387755102040816e-07,
      "loss": -0.3568,
      "num_tokens": 1047458.0,
      "reward": 0.018970444798469543,
      "reward_std": 0.019943157210946083,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.00024493783712387085,
      "rewards/logprob_reward/std": 0.000614454853348434,
      "step": 44
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 832.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 450.90625,
      "completions/mean_terminated_length": 450.90625,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.1388888888888889,
      "grad_norm": 6.723842234335574,
      "kl": NaN,
      "learning_rate": 4.4897959183673465e-07,
      "loss": -0.3743,
      "num_tokens": 1067883.0,
      "reward": 0.006346043664962053,
      "reward_std": 0.012692087329924107,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00010671508789528161,
      "rewards/logprob_reward/std": 0.0005800679209642112,
      "step": 45
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 689.0,
      "completions/max_terminated_length": 689.0,
      "completions/mean_length": 411.90625,
      "completions/mean_terminated_length": 411.90625,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.1419753086419753,
      "grad_norm": 2.3938271409190297,
      "kl": NaN,
      "learning_rate": 4.5918367346938775e-07,
      "loss": -0.1081,
      "num_tokens": 1087052.0,
      "reward": 0.006382983643561602,
      "reward_std": 0.007370436564087868,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00014775953604839742,
      "rewards/logprob_reward/std": 0.0005816027987748384,
      "step": 46
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 539.53125,
      "completions/mean_terminated_length": 489.4137878417969,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.14506172839506173,
      "grad_norm": 8.741826559168924,
      "kl": NaN,
      "learning_rate": 4.693877551020408e-07,
      "loss": -0.5614,
      "num_tokens": 1110633.0,
      "reward": 0.009479801170527935,
      "reward_std": 0.01895960234105587,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00011644529149634764,
      "rewards/logprob_reward/std": 0.00039107364136725664,
      "step": 47
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 839.0,
      "completions/max_terminated_length": 839.0,
      "completions/mean_length": 479.3125,
      "completions/mean_terminated_length": 479.3125,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.14814814814814814,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 4.795918367346938e-07,
      "loss": 0.0,
      "num_tokens": 1132355.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 48
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 906.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 535.4375,
      "completions/mean_terminated_length": 535.4375,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.15123456790123457,
      "grad_norm": 6.8910251613189395,
      "kl": NaN,
      "learning_rate": 4.897959183673469e-07,
      "loss": -0.3743,
      "num_tokens": 1156073.0,
      "reward": 0.0063034119084477425,
      "reward_std": 0.012606823816895485,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 5.934640648774803e-05,
      "rewards/logprob_reward/std": 0.00023372126452159137,
      "step": 49
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 620.0,
      "completions/mean_length": 470.75,
      "completions/mean_terminated_length": 433.86669921875,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.15432098765432098,
      "grad_norm": 10.199205303249721,
      "kl": NaN,
      "learning_rate": 5e-07,
      "loss": -0.7485,
      "num_tokens": 1177205.0,
      "reward": 0.012633386999368668,
      "reward_std": 0.025266773998737335,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0001482078223489225,
      "rewards/logprob_reward/std": 0.00046398723497986794,
      "step": 50
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 526.71875,
      "completions/mean_terminated_length": 493.5666809082031,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "epoch": 0.1574074074074074,
      "grad_norm": 22.069679942677325,
      "kl": NaN,
      "learning_rate": 4.999995001298037e-07,
      "loss": -0.4714,
      "num_tokens": 1200696.0,
      "reward": 0.009701208211481571,
      "reward_std": 0.014054418541491032,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00036245345836505294,
      "rewards/logprob_reward/std": 0.001145550631918013,
      "step": 51
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 528.84375,
      "completions/mean_terminated_length": 477.6206970214844,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.16049382716049382,
      "grad_norm": 14.415036849323204,
      "kl": NaN,
      "learning_rate": 4.99998000521214e-07,
      "loss": -0.6278,
      "num_tokens": 1224303.0,
      "reward": 0.012864849530160427,
      "reward_std": 0.025729699060320854,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.00040538786561228335,
      "rewards/logprob_reward/std": 0.00129931780975312,
      "step": 52
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 755.0,
      "completions/mean_length": 547.71875,
      "completions/mean_terminated_length": 479.6785888671875,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.16358024691358025,
      "grad_norm": 5.236887534241916,
      "kl": NaN,
      "learning_rate": 4.999955011802275e-07,
      "loss": -0.2955,
      "num_tokens": 1248478.0,
      "reward": 0.010162696242332458,
      "reward_std": 0.014412381686270237,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0008752185967750847,
      "rewards/logprob_reward/std": 0.003261502366513014,
      "step": 53
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 497.0625,
      "completions/mean_terminated_length": 480.06451416015625,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.16666666666666666,
      "grad_norm": 8.421637357342595,
      "kl": NaN,
      "learning_rate": 4.999920021168393e-07,
      "loss": -0.466,
      "num_tokens": 1271212.0,
      "reward": 0.009957539848983288,
      "reward_std": 0.019376683980226517,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0006472665118053555,
      "rewards/logprob_reward/std": 0.002017608843743801,
      "step": 54
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 475.65625,
      "completions/mean_terminated_length": 457.9677429199219,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.1697530864197531,
      "grad_norm": 14.131655360271075,
      "kl": NaN,
      "learning_rate": 4.999875033450417e-07,
      "loss": -0.5614,
      "num_tokens": 1292665.0,
      "reward": 0.009479843080043793,
      "reward_std": 0.018959686160087585,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00011649253428913653,
      "rewards/logprob_reward/std": 0.0005409479490481317,
      "step": 55
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 850.0,
      "completions/mean_length": 504.25,
      "completions/mean_terminated_length": 469.60003662109375,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.1728395061728395,
      "grad_norm": 9.585090906952301,
      "kl": NaN,
      "learning_rate": 4.999820048828253e-07,
      "loss": -0.7972,
      "num_tokens": 1315305.0,
      "reward": 0.016101833432912827,
      "reward_std": 0.031882334500551224,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0005298148025758564,
      "rewards/logprob_reward/std": 0.0013958527706563473,
      "step": 56
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 515.96875,
      "completions/mean_terminated_length": 499.58062744140625,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.17592592592592593,
      "grad_norm": 13.522680321834711,
      "kl": NaN,
      "learning_rate": 4.999755067521781e-07,
      "loss": -0.7486,
      "num_tokens": 1338224.0,
      "reward": 0.013169733807444572,
      "reward_std": 0.026339467614889145,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0007441485649906099,
      "rewards/logprob_reward/std": 0.0030354137998074293,
      "step": 57
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 711.0,
      "completions/mean_length": 474.125,
      "completions/mean_terminated_length": 456.3870849609375,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.17901234567901234,
      "grad_norm": 8.343067177520528,
      "kl": NaN,
      "learning_rate": 4.999680089790861e-07,
      "loss": -0.7161,
      "num_tokens": 1359760.0,
      "reward": 0.02902640402317047,
      "reward_std": 0.03867201507091522,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0010015605948865414,
      "rewards/logprob_reward/std": 0.002976895309984684,
      "step": 58
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 565.59375,
      "completions/mean_terminated_length": 500.107177734375,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.18209876543209877,
      "grad_norm": 3.4895631441555404,
      "kl": NaN,
      "learning_rate": 4.999595115935325e-07,
      "loss": -0.2656,
      "num_tokens": 1384755.0,
      "reward": 0.02569853514432907,
      "reward_std": 0.012860770337283611,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0007761501474305987,
      "rewards/logprob_reward/std": 0.0013694728258997202,
      "step": 59
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 653.0,
      "completions/max_terminated_length": 653.0,
      "completions/mean_length": 463.71875,
      "completions/mean_terminated_length": 463.71875,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 0.18518518518518517,
      "grad_norm": 5.676083938080832,
      "kl": NaN,
      "learning_rate": 4.999500146294979e-07,
      "loss": -0.539,
      "num_tokens": 1406030.0,
      "reward": 0.03294990956783295,
      "reward_std": 0.033797409385442734,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0018887876067310572,
      "rewards/logprob_reward/std": 0.004393962677568197,
      "step": 60
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 782.0,
      "completions/mean_length": 543.5625,
      "completions/mean_terminated_length": 454.59259033203125,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.1882716049382716,
      "grad_norm": 8.804184607283121,
      "kl": NaN,
      "learning_rate": 4.999395181249604e-07,
      "loss": -0.635,
      "num_tokens": 1429828.0,
      "reward": 0.0346556231379509,
      "reward_std": 0.03339875489473343,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.000311802898067981,
      "rewards/logprob_reward/std": 0.0005382333183661103,
      "step": 61
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 757.0,
      "completions/mean_length": 478.71875,
      "completions/mean_terminated_length": 442.36669921875,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.19135802469135801,
      "grad_norm": 5.5555862620704595,
      "kl": NaN,
      "learning_rate": 4.99928022121895e-07,
      "loss": -0.4251,
      "num_tokens": 1451363.0,
      "reward": 0.02836659923195839,
      "reward_std": 0.03340596705675125,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.00026844331296160817,
      "rewards/logprob_reward/std": 0.0006385487504303455,
      "step": 62
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 831.0,
      "completions/max_terminated_length": 831.0,
      "completions/mean_length": 451.0,
      "completions/mean_terminated_length": 451.0,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.19444444444444445,
      "grad_norm": 9.370034003411387,
      "kl": NaN,
      "learning_rate": 4.99915526666274e-07,
      "loss": -0.948,
      "num_tokens": 1471991.0,
      "reward": 0.037867240607738495,
      "reward_std": 0.04707539454102516,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0004080414946656674,
      "rewards/logprob_reward/std": 0.0006741120596416295,
      "step": 63
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 584.40625,
      "completions/mean_terminated_length": 521.607177734375,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.19753086419753085,
      "grad_norm": 12.851748881128868,
      "kl": NaN,
      "learning_rate": 4.999020318080661e-07,
      "loss": -0.9943,
      "num_tokens": 1497096.0,
      "reward": 0.02561819739639759,
      "reward_std": 0.045522771775722504,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0006868854979984462,
      "rewards/logprob_reward/std": 0.0014965421287342906,
      "step": 64
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 786.0,
      "completions/mean_length": 463.59375,
      "completions/mean_terminated_length": 445.51611328125,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.2006172839506173,
      "grad_norm": 7.583142573219291,
      "kl": NaN,
      "learning_rate": 4.998875376012368e-07,
      "loss": -0.6527,
      "num_tokens": 1518103.0,
      "reward": 0.034932032227516174,
      "reward_std": 0.03384808078408241,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0006189263658598065,
      "rewards/logprob_reward/std": 0.0014778337208554149,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 474.03125,
      "completions/mean_terminated_length": 437.36669921875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "epoch": 0.2037037037037037,
      "grad_norm": 11.646201626453006,
      "kl": 0.01688385009765625,
      "learning_rate": 4.998720441037479e-07,
      "loss": -0.9368,
      "num_tokens": 1539704.0,
      "reward": 0.03542628139257431,
      "reward_std": 0.041238218545913696,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0011680902680382133,
      "rewards/logprob_reward/std": 0.0017735165311023593,
      "step": 66
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 666.0,
      "completions/mean_length": 451.3125,
      "completions/mean_terminated_length": 413.13336181640625,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "epoch": 0.20679012345679013,
      "grad_norm": 5.5349097190879695,
      "kl": NaN,
      "learning_rate": 4.99855551377557e-07,
      "loss": -0.5997,
      "num_tokens": 1560386.0,
      "reward": 0.04110191762447357,
      "reward_std": 0.047965094447135925,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0005299084004946053,
      "rewards/logprob_reward/std": 0.0007469018455594778,
      "step": 67
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 714.0,
      "completions/mean_length": 508.5625,
      "completions/mean_terminated_length": 434.9285888671875,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "epoch": 0.20987654320987653,
      "grad_norm": 6.043545392642072,
      "kl": NaN,
      "learning_rate": 4.998380594886182e-07,
      "loss": -0.4584,
      "num_tokens": 1583228.0,
      "reward": 0.0408533550798893,
      "reward_std": 0.02602376975119114,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0002537275431677699,
      "rewards/logprob_reward/std": 0.0005096375825814903,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 811.0,
      "completions/mean_length": 537.4375,
      "completions/mean_terminated_length": 467.9285888671875,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.21296296296296297,
      "grad_norm": 9.025643600318977,
      "kl": 0.01786041259765625,
      "learning_rate": 4.998195685068808e-07,
      "loss": -0.836,
      "num_tokens": 1606838.0,
      "reward": 0.03815925866365433,
      "reward_std": 0.05255364254117012,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0007325104670599103,
      "rewards/logprob_reward/std": 0.001503912266343832,
      "step": 69
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 467.65625,
      "completions/mean_terminated_length": 430.5666809082031,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.21604938271604937,
      "grad_norm": 15.21712081526465,
      "kl": NaN,
      "learning_rate": 4.998000785062895e-07,
      "loss": -0.9813,
      "num_tokens": 1628447.0,
      "reward": 0.038099028170108795,
      "reward_std": 0.04530385509133339,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0006655875477008522,
      "rewards/logprob_reward/std": 0.001207419903948903,
      "step": 70
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 619.0,
      "completions/mean_length": 434.875,
      "completions/mean_terminated_length": 395.6000061035156,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "epoch": 0.2191358024691358,
      "grad_norm": 6.228276737398428,
      "kl": NaN,
      "learning_rate": 4.997795895647841e-07,
      "loss": -0.4687,
      "num_tokens": 1649051.0,
      "reward": 0.03813503682613373,
      "reward_std": 0.03481914848089218,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0007055974565446377,
      "rewards/logprob_reward/std": 0.0011948143364861608,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 744.0,
      "completions/max_terminated_length": 744.0,
      "completions/mean_length": 447.21875,
      "completions/mean_terminated_length": 447.21875,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.2222222222222222,
      "grad_norm": 6.123223944953748,
      "kl": 0.01909637451171875,
      "learning_rate": 4.997581017642991e-07,
      "loss": -0.536,
      "num_tokens": 1669838.0,
      "reward": 0.06365455687046051,
      "reward_std": 0.04029175266623497,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012828335165977478,
      "rewards/logprob_reward/std": 0.0024304192047566175,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 505.0,
      "completions/mean_terminated_length": 488.258056640625,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.22530864197530864,
      "grad_norm": 4.045335637387813,
      "kl": 0.0219268798828125,
      "learning_rate": 4.997356151907633e-07,
      "loss": -0.3864,
      "num_tokens": 1692502.0,
      "reward": 0.05487312376499176,
      "reward_std": 0.04867660999298096,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0019423530902713537,
      "rewards/logprob_reward/std": 0.0025331033393740654,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 742.0,
      "completions/mean_length": 557.15625,
      "completions/mean_terminated_length": 508.862060546875,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 0.22839506172839505,
      "grad_norm": 6.256315480758599,
      "kl": 0.0246429443359375,
      "learning_rate": 4.997121299340997e-07,
      "loss": -0.6425,
      "num_tokens": 1716799.0,
      "reward": 0.054621484130620956,
      "reward_std": 0.05373973399400711,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0016627591103315353,
      "rewards/logprob_reward/std": 0.003260746132582426,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 609.0,
      "completions/mean_length": 469.5625,
      "completions/mean_terminated_length": 451.6773986816406,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.23148148148148148,
      "grad_norm": 5.3090812476367875,
      "kl": 0.02172088623046875,
      "learning_rate": 4.99687646088225e-07,
      "loss": -0.644,
      "num_tokens": 1737913.0,
      "reward": 0.05796758458018303,
      "reward_std": 0.05293092876672745,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0019084264058619738,
      "rewards/logprob_reward/std": 0.0030874432995915413,
      "step": 75
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 929.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 506.3125,
      "completions/mean_terminated_length": 506.3125,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.2345679012345679,
      "grad_norm": 3.09483623935045,
      "kl": NaN,
      "learning_rate": 4.996621637510491e-07,
      "loss": -0.2407,
      "num_tokens": 1760615.0,
      "reward": 0.0681268572807312,
      "reward_std": 0.0321149118244648,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0027798402588814497,
      "rewards/logprob_reward/std": 0.0033658831380307674,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 882.0,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 479.40625,
      "completions/mean_terminated_length": 479.40625,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "epoch": 0.23765432098765432,
      "grad_norm": 6.022043154609086,
      "kl": 0.0325927734375,
      "learning_rate": 4.996356830244749e-07,
      "loss": -0.3437,
      "num_tokens": 1782048.0,
      "reward": 0.06664453446865082,
      "reward_std": 0.03348292410373688,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0011328145628795028,
      "rewards/logprob_reward/std": 0.0022796066477894783,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 599.0,
      "completions/mean_length": 444.90625,
      "completions/mean_terminated_length": 406.3000183105469,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.24074074074074073,
      "grad_norm": 10.690746280967904,
      "kl": 0.029754638671875,
      "learning_rate": 4.996082040143977e-07,
      "loss": -0.4602,
      "num_tokens": 1802945.0,
      "reward": 0.07090184837579727,
      "reward_std": 0.03920045495033264,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0023909390438348055,
      "rewards/logprob_reward/std": 0.0038968671578913927,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 526.4375,
      "completions/mean_terminated_length": 474.96551513671875,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.24382716049382716,
      "grad_norm": 5.942169315397655,
      "kl": 0.02519989013671875,
      "learning_rate": 4.995797268307051e-07,
      "loss": -0.3951,
      "num_tokens": 1826151.0,
      "reward": 0.04450929909944534,
      "reward_std": 0.047439657151699066,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0008436637581326067,
      "rewards/logprob_reward/std": 0.0016849666135385633,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 780.0,
      "completions/mean_length": 609.8125,
      "completions/mean_terminated_length": 471.75,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 0.24691358024691357,
      "grad_norm": 7.553668248560938,
      "kl": 0.0283966064453125,
      "learning_rate": 4.995502515872763e-07,
      "loss": -0.5494,
      "num_tokens": 1852245.0,
      "reward": 0.04472878575325012,
      "reward_std": 0.050996411591768265,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0010875379666686058,
      "rewards/logprob_reward/std": 0.0020124169532209635,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 526.0625,
      "completions/mean_terminated_length": 510.0,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.25,
      "grad_norm": 6.008576594263075,
      "kl": 0.026123046875,
      "learning_rate": 4.995197784019818e-07,
      "loss": -0.2975,
      "num_tokens": 1875791.0,
      "reward": 0.060693349689245224,
      "reward_std": 0.04175513982772827,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0014648307114839554,
      "rewards/logprob_reward/std": 0.002707106526941061,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 541.75,
      "completions/mean_terminated_length": 509.60003662109375,
      "completions/min_length": 278.0,
      "completions/min_terminated_length": 278.0,
      "epoch": 0.25308641975308643,
      "grad_norm": 5.849176959121505,
      "kl": 0.028839111328125,
      "learning_rate": 4.994883073966823e-07,
      "loss": -0.3656,
      "num_tokens": 1899519.0,
      "reward": 0.060538940131664276,
      "reward_std": 0.04685738682746887,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0012932643294334412,
      "rewards/logprob_reward/std": 0.0021190636325627565,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 810.0,
      "completions/mean_length": 468.125,
      "completions/mean_terminated_length": 450.19354248046875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.25617283950617287,
      "grad_norm": 4.10962518518028,
      "kl": 0.0347137451171875,
      "learning_rate": 4.994558386972295e-07,
      "loss": -0.1882,
      "num_tokens": 1920863.0,
      "reward": 0.05730611830949783,
      "reward_std": 0.04541774466633797,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0011734651634469628,
      "rewards/logprob_reward/std": 0.002099178498610854,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 647.03125,
      "completions/mean_terminated_length": 608.0344848632812,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 0.25925925925925924,
      "grad_norm": 3.286319365867372,
      "kl": 0.02838897705078125,
      "learning_rate": 4.994223724334643e-07,
      "loss": -0.2404,
      "num_tokens": 1948432.0,
      "reward": 0.05551876127719879,
      "reward_std": 0.05297388881444931,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002659732010215521,
      "rewards/logprob_reward/std": 0.004493256099522114,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 737.0,
      "completions/mean_length": 543.53125,
      "completions/mean_terminated_length": 493.82757568359375,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.2623456790123457,
      "grad_norm": 4.224829549134744,
      "kl": 0.0585479736328125,
      "learning_rate": 4.99387908739217e-07,
      "loss": -0.2441,
      "num_tokens": 1972285.0,
      "reward": 0.053791362792253494,
      "reward_std": 0.04617141932249069,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0007404016796499491,
      "rewards/logprob_reward/std": 0.0013021408813074231,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 593.53125,
      "completions/mean_terminated_length": 513.8148193359375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.2654320987654321,
      "grad_norm": 3.7137649235386068,
      "kl": 0.063995361328125,
      "learning_rate": 4.993524477523067e-07,
      "loss": -0.1313,
      "num_tokens": 1998098.0,
      "reward": 0.06119167059659958,
      "reward_std": 0.054584987461566925,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002018526429310441,
      "rewards/logprob_reward/std": 0.004277835600078106,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 706.0,
      "completions/mean_length": 485.96875,
      "completions/mean_terminated_length": 468.6128845214844,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.26851851851851855,
      "grad_norm": 3.694604424319856,
      "kl": 0.0387420654296875,
      "learning_rate": 4.993159896145405e-07,
      "loss": -0.2402,
      "num_tokens": 2020197.0,
      "reward": 0.054866090416908264,
      "reward_std": 0.05415533855557442,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0019345462787896395,
      "rewards/logprob_reward/std": 0.0032406121026724577,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 743.0,
      "completions/mean_length": 482.78125,
      "completions/mean_terminated_length": 465.32257080078125,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.2716049382716049,
      "grad_norm": 3.774088917632137,
      "kl": 0.041046142578125,
      "learning_rate": 4.99278534471713e-07,
      "loss": -0.2648,
      "num_tokens": 2042238.0,
      "reward": 0.06346947699785233,
      "reward_std": 0.04732219874858856,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0010771907400339842,
      "rewards/logprob_reward/std": 0.0016593115869909525,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 542.5,
      "completions/mean_terminated_length": 510.4000244140625,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.27469135802469136,
      "grad_norm": 3.347485424204535,
      "kl": 0.0376434326171875,
      "learning_rate": 4.992400824736059e-07,
      "loss": -0.3509,
      "num_tokens": 2065678.0,
      "reward": 0.05103076249361038,
      "reward_std": 0.047450192272663116,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0011452909093350172,
      "rewards/logprob_reward/std": 0.0024503832682967186,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 747.0,
      "completions/mean_length": 535.375,
      "completions/mean_terminated_length": 502.8000183105469,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.2777777777777778,
      "grad_norm": 7.001336144386259,
      "kl": 0.0478057861328125,
      "learning_rate": 4.992006337739874e-07,
      "loss": -0.4782,
      "num_tokens": 2089774.0,
      "reward": 0.05893455818295479,
      "reward_std": 0.043769750744104385,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0029828420374542475,
      "rewards/logprob_reward/std": 0.004484000615775585,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 867.0,
      "completions/mean_length": 519.96875,
      "completions/mean_terminated_length": 486.36669921875,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.2808641975308642,
      "grad_norm": 3.097845696323117,
      "kl": 0.0368804931640625,
      "learning_rate": 4.991601885306111e-07,
      "loss": -0.1096,
      "num_tokens": 2113281.0,
      "reward": 0.03542055934667587,
      "reward_std": 0.04836319386959076,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0011617304990068078,
      "rewards/logprob_reward/std": 0.0018826224841177464,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 545.28125,
      "completions/mean_terminated_length": 513.36669921875,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.2839506172839506,
      "grad_norm": 3.7743253756780084,
      "kl": 0.050323486328125,
      "learning_rate": 4.991187469052162e-07,
      "loss": -0.1952,
      "num_tokens": 2137282.0,
      "reward": 0.05702035129070282,
      "reward_std": 0.05189286172389984,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0008559462148696184,
      "rewards/logprob_reward/std": 0.0017882605316117406,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 501.53125,
      "completions/mean_terminated_length": 447.4827575683594,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.28703703703703703,
      "grad_norm": 3.8907350246801036,
      "kl": 0.04400634765625,
      "learning_rate": 4.99076309063526e-07,
      "loss": -0.1082,
      "num_tokens": 2159795.0,
      "reward": 0.051053136587142944,
      "reward_std": 0.045405298471450806,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.00117015209980309,
      "rewards/logprob_reward/std": 0.002091635251417756,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 652.09375,
      "completions/mean_terminated_length": 583.2222290039062,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.29012345679012347,
      "grad_norm": 2.293708714001795,
      "kl": 0.0368194580078125,
      "learning_rate": 4.99032875175248e-07,
      "loss": 0.1117,
      "num_tokens": 2186946.0,
      "reward": 0.053791485726833344,
      "reward_std": 0.04802599549293518,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0007405409123748541,
      "rewards/logprob_reward/std": 0.0019137536874040961,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 763.0,
      "completions/mean_length": 574.90625,
      "completions/mean_terminated_length": 510.7500305175781,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.2932098765432099,
      "grad_norm": 2.951710553313073,
      "kl": 0.049102783203125,
      "learning_rate": 4.989884454140724e-07,
      "loss": -0.0989,
      "num_tokens": 2211843.0,
      "reward": 0.05078906565904617,
      "reward_std": 0.047226108610630035,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0008767400868237019,
      "rewards/logprob_reward/std": 0.0018862105207517743,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 804.0,
      "completions/mean_length": 570.21875,
      "completions/mean_terminated_length": 505.39288330078125,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.2962962962962963,
      "grad_norm": 2.706009833282272,
      "kl": 0.04150390625,
      "learning_rate": 4.989430199576722e-07,
      "loss": -0.008,
      "num_tokens": 2236422.0,
      "reward": 0.045017652213573456,
      "reward_std": 0.04995456710457802,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0014085028087720275,
      "rewards/logprob_reward/std": 0.003322416450828314,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 758.0,
      "completions/mean_length": 543.375,
      "completions/mean_terminated_length": 511.3333740234375,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.2993827160493827,
      "grad_norm": 2.7390931663000604,
      "kl": 0.0435791015625,
      "learning_rate": 4.988965989877022e-07,
      "loss": -0.0619,
      "num_tokens": 2260418.0,
      "reward": 0.05442715808749199,
      "reward_std": 0.04363364353775978,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0014468419831246138,
      "rewards/logprob_reward/std": 0.002777144545689225,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 565.0625,
      "completions/mean_terminated_length": 534.4666748046875,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "epoch": 0.30246913580246915,
      "grad_norm": 2.6940412752911413,
      "kl": 0.0467376708984375,
      "learning_rate": 4.988491826897978e-07,
      "loss": 0.0639,
      "num_tokens": 2285240.0,
      "reward": 0.06444443762302399,
      "reward_std": 0.0413370281457901,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0021604890935122967,
      "rewards/logprob_reward/std": 0.004000400193035603,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 614.59375,
      "completions/mean_terminated_length": 556.107177734375,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.3055555555555556,
      "grad_norm": 2.9928673723537904,
      "kl": 0.047119140625,
      "learning_rate": 4.988007712535752e-07,
      "loss": -0.0496,
      "num_tokens": 2310995.0,
      "reward": 0.05372615158557892,
      "reward_std": 0.0532066747546196,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0006679468788206577,
      "rewards/logprob_reward/std": 0.0012105830246582627,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 834.0,
      "completions/mean_length": 627.28125,
      "completions/mean_terminated_length": 516.2000122070312,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.30864197530864196,
      "grad_norm": 2.5027149167555587,
      "kl": 0.047332763671875,
      "learning_rate": 4.987513648726298e-07,
      "loss": -0.0632,
      "num_tokens": 2337436.0,
      "reward": 0.0507701113820076,
      "reward_std": 0.04203411191701889,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0008556764805689454,
      "rewards/logprob_reward/std": 0.0016093184240162373,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 607.34375,
      "completions/mean_terminated_length": 579.5667114257812,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.3117283950617284,
      "grad_norm": 3.139962304140252,
      "kl": 0.052276611328125,
      "learning_rate": 4.987009637445358e-07,
      "loss": 0.0031,
      "num_tokens": 2363355.0,
      "reward": 0.04761410504579544,
      "reward_std": 0.05293666571378708,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0008212258107960224,
      "rewards/logprob_reward/std": 0.0012592091225087643,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 527.75,
      "completions/mean_terminated_length": 494.66668701171875,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "epoch": 0.3148148148148148,
      "grad_norm": 2.883736141072442,
      "kl": 0.0460968017578125,
      "learning_rate": 4.986495680708453e-07,
      "loss": -0.0103,
      "num_tokens": 2386411.0,
      "reward": 0.06110449135303497,
      "reward_std": 0.05544377118349075,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0019216546788811684,
      "rewards/logprob_reward/std": 0.0034797852858901024,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 747.0,
      "completions/mean_length": 624.4375,
      "completions/mean_terminated_length": 512.5599975585938,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.31790123456790126,
      "grad_norm": 2.7624234295814936,
      "kl": 0.046630859375,
      "learning_rate": 4.985971780570878e-07,
      "loss": -0.047,
      "num_tokens": 2412781.0,
      "reward": 0.04408486187458038,
      "reward_std": 0.039759885519742966,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0003720703534781933,
      "rewards/logprob_reward/std": 0.0007752202218398452,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 590.0,
      "completions/mean_terminated_length": 545.1034545898438,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 0.32098765432098764,
      "grad_norm": 3.1068981475336437,
      "kl": 0.051055908203125,
      "learning_rate": 4.985437939127687e-07,
      "loss": -0.0894,
      "num_tokens": 2438325.0,
      "reward": 0.05655660480260849,
      "reward_std": 0.046917624771595,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00034067226806655526,
      "rewards/logprob_reward/std": 0.000890880124643445,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 570.1875,
      "completions/mean_terminated_length": 505.357177734375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.32407407407407407,
      "grad_norm": 2.4669522103941977,
      "kl": 0.0426177978515625,
      "learning_rate": 4.984894158513696e-07,
      "loss": -0.2237,
      "num_tokens": 2463147.0,
      "reward": 0.051290228962898254,
      "reward_std": 0.053184136748313904,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0014335874002426863,
      "rewards/logprob_reward/std": 0.003448633011430502,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 614.28125,
      "completions/mean_terminated_length": 571.8965454101562,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.3271604938271605,
      "grad_norm": 3.6235426676332723,
      "kl": 0.0612030029296875,
      "learning_rate": 4.984340440903456e-07,
      "loss": -0.1257,
      "num_tokens": 2489568.0,
      "reward": 0.05727202072739601,
      "reward_std": 0.0427728109061718,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0011355748865753412,
      "rewards/logprob_reward/std": 0.002726617967709899,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 748.0,
      "completions/mean_length": 545.75,
      "completions/mean_terminated_length": 496.2758483886719,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.33024691358024694,
      "grad_norm": 3.8062614866717412,
      "kl": 0.04974365234375,
      "learning_rate": 4.983776788511268e-07,
      "loss": -0.161,
      "num_tokens": 2513680.0,
      "reward": 0.04481091350317001,
      "reward_std": 0.040245793759822845,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0011787915136665106,
      "rewards/logprob_reward/std": 0.0018818581011146307,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 570.625,
      "completions/mean_terminated_length": 505.857177734375,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.3333333333333333,
      "grad_norm": 3.061970292466625,
      "kl": 0.062469482421875,
      "learning_rate": 4.983203203591154e-07,
      "loss": -0.1396,
      "num_tokens": 2538524.0,
      "reward": 0.053908370435237885,
      "reward_std": 0.055700093507766724,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0008704091887921095,
      "rewards/logprob_reward/std": 0.003946096636354923,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 779.0,
      "completions/max_terminated_length": 779.0,
      "completions/mean_length": 494.8125,
      "completions/mean_terminated_length": 494.8125,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 0.33641975308641975,
      "grad_norm": 2.7202993772031805,
      "kl": 0.0513916015625,
      "learning_rate": 4.982619688436859e-07,
      "loss": -0.0298,
      "num_tokens": 2560690.0,
      "reward": 0.06022677570581436,
      "reward_std": 0.04589129984378815,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0009464181493967772,
      "rewards/logprob_reward/std": 0.0018110605888068676,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 542.15625,
      "completions/mean_terminated_length": 526.6129150390625,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.3395061728395062,
      "grad_norm": 3.346190718258864,
      "kl": 0.066192626953125,
      "learning_rate": 4.982026245381837e-07,
      "loss": -0.1211,
      "num_tokens": 2584415.0,
      "reward": 0.05337420850992203,
      "reward_std": 0.04603324830532074,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.000276896491413936,
      "rewards/logprob_reward/std": 0.0007598244701512158,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 823.0,
      "completions/mean_length": 609.75,
      "completions/mean_terminated_length": 533.0370483398438,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 0.3425925925925926,
      "grad_norm": 2.379277616100787,
      "kl": 0.0584259033203125,
      "learning_rate": 4.981422876799244e-07,
      "loss": -0.0572,
      "num_tokens": 2610763.0,
      "reward": 0.04142041876912117,
      "reward_std": 0.04397819936275482,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0008838011417537928,
      "rewards/logprob_reward/std": 0.0019359972793608904,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 575.5,
      "completions/mean_terminated_length": 561.0322265625,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 0.345679012345679,
      "grad_norm": 2.736137122162934,
      "kl": 0.0489501953125,
      "learning_rate": 4.980809585101927e-07,
      "loss": -0.0628,
      "num_tokens": 2635719.0,
      "reward": 0.061229631304740906,
      "reward_std": 0.027119195088744164,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0020606969483196735,
      "rewards/logprob_reward/std": 0.0034382117446511984,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 566.625,
      "completions/mean_terminated_length": 536.1333618164062,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.3487654320987654,
      "grad_norm": 2.477840339001624,
      "kl": 0.054168701171875,
      "learning_rate": 4.980186372742417e-07,
      "loss": -0.0458,
      "num_tokens": 2659935.0,
      "reward": 0.06329086422920227,
      "reward_std": 0.04749561846256256,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.000878739170730114,
      "rewards/logprob_reward/std": 0.0020262636244297028,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 526.0,
      "completions/mean_terminated_length": 509.9354553222656,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.35185185185185186,
      "grad_norm": 3.5438811757063107,
      "kl": 0.09625244140625,
      "learning_rate": 4.979553242212917e-07,
      "loss": -0.1171,
      "num_tokens": 2683463.0,
      "reward": 0.05102846771478653,
      "reward_std": 0.04755294322967529,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0011427418794482946,
      "rewards/logprob_reward/std": 0.0026069299783557653,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 571.9375,
      "completions/mean_terminated_length": 541.800048828125,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.3549382716049383,
      "grad_norm": 2.5227547419462355,
      "kl": 0.049407958984375,
      "learning_rate": 4.978910196045291e-07,
      "loss": -0.1261,
      "num_tokens": 2708077.0,
      "reward": 0.06320597231388092,
      "reward_std": 0.04742685705423355,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0007844074862077832,
      "rewards/logprob_reward/std": 0.002121969358995557,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 576.28125,
      "completions/mean_terminated_length": 546.433349609375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "epoch": 0.35802469135802467,
      "grad_norm": 7.359540387286494,
      "kl": 0.068756103515625,
      "learning_rate": 4.978257236811055e-07,
      "loss": -0.3935,
      "num_tokens": 2733578.0,
      "reward": 0.05617111921310425,
      "reward_std": 0.04427188262343407,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003384580370038748,
      "rewards/logprob_reward/std": 0.01107009407132864,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 885.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 546.4375,
      "completions/mean_terminated_length": 546.4375,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.3611111111111111,
      "grad_norm": 3.109979629121864,
      "kl": 0.0557861328125,
      "learning_rate": 4.977594367121369e-07,
      "loss": -0.1422,
      "num_tokens": 2757568.0,
      "reward": 0.05709470063447952,
      "reward_std": 0.0473439060151577,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0009385535959154367,
      "rewards/logprob_reward/std": 0.0019288675393909216,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 785.0,
      "completions/mean_length": 586.5625,
      "completions/mean_terminated_length": 524.0714721679688,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.36419753086419754,
      "grad_norm": 3.2522664113011843,
      "kl": 0.05938720703125,
      "learning_rate": 4.976921589627021e-07,
      "loss": -0.1721,
      "num_tokens": 2782918.0,
      "reward": 0.052726443856954575,
      "reward_std": 0.0450054295361042,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.006501602008938789,
      "rewards/logprob_reward/std": 0.015674972906708717,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 548.25,
      "completions/mean_terminated_length": 499.03448486328125,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "epoch": 0.36728395061728397,
      "grad_norm": 3.455605600846889,
      "kl": 0.057098388671875,
      "learning_rate": 4.976238907018427e-07,
      "loss": -0.2375,
      "num_tokens": 2807090.0,
      "reward": 0.06296929717063904,
      "reward_std": 0.03989600017666817,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0005214466946199536,
      "rewards/logprob_reward/std": 0.0011562422150745988,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 592.9375,
      "completions/mean_terminated_length": 513.1111450195312,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.37037037037037035,
      "grad_norm": 2.5785227908166624,
      "kl": 0.0533447265625,
      "learning_rate": 4.975546322025605e-07,
      "loss": -0.084,
      "num_tokens": 2832220.0,
      "reward": 0.05106581747531891,
      "reward_std": 0.05249335616827011,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.001184241147711873,
      "rewards/logprob_reward/std": 0.002151809399947524,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 609.90625,
      "completions/mean_terminated_length": 567.0689697265625,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.3734567901234568,
      "grad_norm": 4.374803977435889,
      "kl": 0.057281494140625,
      "learning_rate": 4.974843837418175e-07,
      "loss": -0.2325,
      "num_tokens": 2858537.0,
      "reward": 0.04909980297088623,
      "reward_std": 0.050683602690696716,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0024720027577131987,
      "rewards/logprob_reward/std": 0.0034602447412908077,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 819.0,
      "completions/mean_length": 545.53125,
      "completions/mean_terminated_length": 513.6333618164062,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 0.3765432098765432,
      "grad_norm": 2.8643841333726985,
      "kl": 0.059661865234375,
      "learning_rate": 4.974131456005349e-07,
      "loss": -0.2127,
      "num_tokens": 2882294.0,
      "reward": 0.060503534972667694,
      "reward_std": 0.05500777065753937,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0012539270101115108,
      "rewards/logprob_reward/std": 0.002895203186199069,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 837.0,
      "completions/mean_length": 536.9375,
      "completions/mean_terminated_length": 521.2257690429688,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "epoch": 0.37962962962962965,
      "grad_norm": 2.8088641308214384,
      "kl": 0.0654296875,
      "learning_rate": 4.973409180635911e-07,
      "loss": -0.0703,
      "num_tokens": 2906556.0,
      "reward": 0.057338517159223557,
      "reward_std": 0.047599297016859055,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0012094636913388968,
      "rewards/logprob_reward/std": 0.002666698070243001,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 785.0,
      "completions/mean_length": 568.875,
      "completions/mean_terminated_length": 538.5333862304688,
      "completions/min_length": 269.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 0.38271604938271603,
      "grad_norm": 2.5218944395967955,
      "kl": 0.062347412109375,
      "learning_rate": 4.972677014198213e-07,
      "loss": -0.1082,
      "num_tokens": 2931476.0,
      "reward": 0.06511932611465454,
      "reward_std": 0.04857144504785538,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0029103620909154415,
      "rewards/logprob_reward/std": 0.004752908833324909,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 870.0,
      "completions/mean_length": 597.65625,
      "completions/mean_terminated_length": 569.2333374023438,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.38580246913580246,
      "grad_norm": 2.6358729093991218,
      "kl": 0.05462646484375,
      "learning_rate": 4.97193495962016e-07,
      "loss": -0.0485,
      "num_tokens": 2956733.0,
      "reward": 0.05737147852778435,
      "reward_std": 0.04644714668393135,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0012460858561098576,
      "rewards/logprob_reward/std": 0.0020421771332621574,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 607.15625,
      "completions/mean_terminated_length": 579.36669921875,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 0.3888888888888889,
      "grad_norm": 3.516291466833583,
      "kl": 0.0527191162109375,
      "learning_rate": 4.971183019869201e-07,
      "loss": -0.2645,
      "num_tokens": 2982554.0,
      "reward": 0.04825485497713089,
      "reward_std": 0.04806240648031235,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001533173373900354,
      "rewards/logprob_reward/std": 0.002815558109432459,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 606.65625,
      "completions/mean_terminated_length": 547.0357666015625,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.39197530864197533,
      "grad_norm": 2.712705374458109,
      "kl": 0.052276611328125,
      "learning_rate": 4.970421197952311e-07,
      "loss": 0.0621,
      "num_tokens": 3008063.0,
      "reward": 0.048473648726940155,
      "reward_std": 0.046610329300165176,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001776272663846612,
      "rewards/logprob_reward/std": 0.0030606482177972794,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 610.34375,
      "completions/mean_terminated_length": 582.7667236328125,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.3950617283950617,
      "grad_norm": 2.536075529612535,
      "kl": 0.061492919921875,
      "learning_rate": 4.969649496915991e-07,
      "loss": 0.0236,
      "num_tokens": 3033902.0,
      "reward": 0.052005015313625336,
      "reward_std": 0.04411856085062027,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.00222779531031847,
      "rewards/logprob_reward/std": 0.005151921417564154,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 525.75,
      "completions/mean_terminated_length": 474.2069091796875,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.39814814814814814,
      "grad_norm": 3.011332619467256,
      "kl": 0.074005126953125,
      "learning_rate": 4.96886791984624e-07,
      "loss": -0.3031,
      "num_tokens": 3057122.0,
      "reward": 0.05376160889863968,
      "reward_std": 0.053026266396045685,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0007073450833559036,
      "rewards/logprob_reward/std": 0.0019466037629172206,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 702.0,
      "completions/mean_length": 490.75,
      "completions/mean_terminated_length": 455.20001220703125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.4012345679012346,
      "grad_norm": 3.144798988669224,
      "kl": 0.072296142578125,
      "learning_rate": 4.968076469868558e-07,
      "loss": -0.2522,
      "num_tokens": 3079510.0,
      "reward": 0.05199963599443436,
      "reward_std": 0.04264131188392639,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0022218169178813696,
      "rewards/logprob_reward/std": 0.006449904292821884,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 776.0,
      "completions/mean_length": 519.71875,
      "completions/mean_terminated_length": 503.45159912109375,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.404320987654321,
      "grad_norm": 2.779361540038843,
      "kl": 0.0804443359375,
      "learning_rate": 4.967275150147921e-07,
      "loss": -0.2118,
      "num_tokens": 3102905.0,
      "reward": 0.0702369213104248,
      "reward_std": 0.0397680439054966,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0016521394718438387,
      "rewards/logprob_reward/std": 0.0031119592022150755,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 623.5,
      "completions/mean_terminated_length": 596.800048828125,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.4074074074074074,
      "grad_norm": 2.907337266435984,
      "kl": 0.25628662109375,
      "learning_rate": 4.966463963888775e-07,
      "loss": -0.0338,
      "num_tokens": 3129681.0,
      "reward": 0.07101751863956451,
      "reward_std": 0.02736024744808674,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0025194690097123384,
      "rewards/logprob_reward/std": 0.004677193239331245,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 856.0,
      "completions/mean_length": 572.78125,
      "completions/mean_terminated_length": 526.1034545898438,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.4104938271604938,
      "grad_norm": 3.1986088549857206,
      "kl": 0.0638427734375,
      "learning_rate": 4.965642914335025e-07,
      "loss": -0.172,
      "num_tokens": 3154698.0,
      "reward": 0.04453890770673752,
      "reward_std": 0.04678146913647652,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0008765653474256396,
      "rewards/logprob_reward/std": 0.0020337984897196293,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 871.0,
      "completions/mean_length": 573.28125,
      "completions/mean_terminated_length": 558.741943359375,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.41358024691358025,
      "grad_norm": 2.7195080598710395,
      "kl": 0.071533203125,
      "learning_rate": 4.964812004770013e-07,
      "loss": -0.0662,
      "num_tokens": 3180315.0,
      "reward": 0.057729728519916534,
      "reward_std": 0.04083167761564255,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0016441469779238105,
      "rewards/logprob_reward/std": 0.003845022525638342,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 536.46875,
      "completions/mean_terminated_length": 503.9667053222656,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 0.4166666666666667,
      "grad_norm": 3.7904174216466835,
      "kl": 0.079833984375,
      "learning_rate": 4.963971238516519e-07,
      "loss": -0.2033,
      "num_tokens": 3203954.0,
      "reward": 0.04193691164255142,
      "reward_std": 0.041769422590732574,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0014576807152479887,
      "rewards/logprob_reward/std": 0.0031217315699905157,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 913.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 503.0,
      "completions/mean_terminated_length": 503.0,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "epoch": 0.41975308641975306,
      "grad_norm": 3.187683602245789,
      "kl": 0.062591552734375,
      "learning_rate": 4.963120618936732e-07,
      "loss": -0.1837,
      "num_tokens": 3226382.0,
      "reward": 0.042183056473731995,
      "reward_std": 0.04615149646997452,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0017311744159087539,
      "rewards/logprob_reward/std": 0.003340093418955803,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 857.0,
      "completions/mean_length": 572.40625,
      "completions/mean_terminated_length": 542.300048828125,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 0.4228395061728395,
      "grad_norm": 2.6112897569480227,
      "kl": 0.06365966796875,
      "learning_rate": 4.962260149432247e-07,
      "loss": -0.1889,
      "num_tokens": 3251047.0,
      "reward": 0.06047273054718971,
      "reward_std": 0.045773088932037354,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0012196984607726336,
      "rewards/logprob_reward/std": 0.002573272679001093,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 568.40625,
      "completions/mean_terminated_length": 553.7096557617188,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.42592592592592593,
      "grad_norm": 2.469587378558464,
      "kl": 0.062469482421875,
      "learning_rate": 4.96138983344405e-07,
      "loss": -0.155,
      "num_tokens": 3275408.0,
      "reward": 0.06650030612945557,
      "reward_std": 0.04502515867352486,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0009725566487759352,
      "rewards/logprob_reward/std": 0.002437157789245248,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 817.0,
      "completions/mean_length": 573.4375,
      "completions/mean_terminated_length": 526.8275756835938,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.42901234567901236,
      "grad_norm": 2.328868779136458,
      "kl": 0.06634521484375,
      "learning_rate": 4.9605096744525e-07,
      "loss": -0.0233,
      "num_tokens": 3300246.0,
      "reward": 0.07066695392131805,
      "reward_std": 0.025806615129113197,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0021299482323229313,
      "rewards/logprob_reward/std": 0.004689618945121765,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 713.0,
      "completions/mean_length": 532.03125,
      "completions/mean_terminated_length": 516.1612548828125,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 0.43209876543209874,
      "grad_norm": 3.333962717233176,
      "kl": 0.107452392578125,
      "learning_rate": 4.95961967597732e-07,
      "loss": -0.14,
      "num_tokens": 3323807.0,
      "reward": 0.05786291882395744,
      "reward_std": 0.04674186185002327,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017921293620020151,
      "rewards/logprob_reward/std": 0.0031645300332456827,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 614.34375,
      "completions/mean_terminated_length": 571.9655151367188,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.4351851851851852,
      "grad_norm": 2.9684297054896156,
      "kl": 0.065948486328125,
      "learning_rate": 4.958719841577579e-07,
      "loss": -0.2246,
      "num_tokens": 3350090.0,
      "reward": 0.044370800256729126,
      "reward_std": 0.046897463500499725,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0006897771963849664,
      "rewards/logprob_reward/std": 0.001732326578348875,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 787.0,
      "completions/mean_length": 549.46875,
      "completions/mean_terminated_length": 517.8333740234375,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.4382716049382716,
      "grad_norm": 3.932832666658898,
      "kl": 0.0792236328125,
      "learning_rate": 4.957810174851679e-07,
      "loss": -0.1534,
      "num_tokens": 3374729.0,
      "reward": 0.04233923554420471,
      "reward_std": 0.05221627652645111,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001904707052744925,
      "rewards/logprob_reward/std": 0.0034659402444958687,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 945.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 551.40625,
      "completions/mean_terminated_length": 551.40625,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 0.44135802469135804,
      "grad_norm": 2.544294118168252,
      "kl": 0.081085205078125,
      "learning_rate": 4.956890679437345e-07,
      "loss": -0.0192,
      "num_tokens": 3398474.0,
      "reward": 0.06388504803180695,
      "reward_std": 0.0397639200091362,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0015389358159154654,
      "rewards/logprob_reward/std": 0.0027499685529619455,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 934.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 570.96875,
      "completions/mean_terminated_length": 570.96875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.4444444444444444,
      "grad_norm": 2.9359685242986684,
      "kl": 0.066986083984375,
      "learning_rate": 4.955961359011601e-07,
      "loss": -0.1657,
      "num_tokens": 3423177.0,
      "reward": 0.03861922770738602,
      "reward_std": 0.046918563544750214,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012435840908437967,
      "rewards/logprob_reward/std": 0.002308847848325968,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 601.78125,
      "completions/mean_terminated_length": 588.1612548828125,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "epoch": 0.44753086419753085,
      "grad_norm": 2.7023965534656407,
      "kl": 0.073272705078125,
      "learning_rate": 4.955022217290766e-07,
      "loss": -0.1101,
      "num_tokens": 3449090.0,
      "reward": 0.05682101473212242,
      "reward_std": 0.05433877557516098,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0006344596622511744,
      "rewards/logprob_reward/std": 0.0016948895063251257,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 560.46875,
      "completions/mean_terminated_length": 512.5172119140625,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 0.4506172839506173,
      "grad_norm": 2.986924507376984,
      "kl": 0.075164794921875,
      "learning_rate": 4.954073258030431e-07,
      "loss": 0.0157,
      "num_tokens": 3473221.0,
      "reward": 0.05381591618061066,
      "reward_std": 0.04771006107330322,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0007676836103200912,
      "rewards/logprob_reward/std": 0.0014980339910835028,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 553.6875,
      "completions/mean_terminated_length": 538.51611328125,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.4537037037037037,
      "grad_norm": 2.843594232342128,
      "kl": 0.07666015625,
      "learning_rate": 4.953114485025446e-07,
      "loss": -0.2028,
      "num_tokens": 3497243.0,
      "reward": 0.044478774070739746,
      "reward_std": 0.054053470492362976,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0008097498212009668,
      "rewards/logprob_reward/std": 0.0022112694568932056,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 603.75,
      "completions/mean_terminated_length": 560.27587890625,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.4567901234567901,
      "grad_norm": 2.8092190763627793,
      "kl": 0.078277587890625,
      "learning_rate": 4.95214590210991e-07,
      "loss": -0.2411,
      "num_tokens": 3523095.0,
      "reward": 0.044111467897892,
      "reward_std": 0.04442127048969269,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0004016283492092043,
      "rewards/logprob_reward/std": 0.0010047410614788532,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 947.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 632.78125,
      "completions/mean_terminated_length": 632.78125,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.45987654320987653,
      "grad_norm": 2.5107977402015056,
      "kl": 0.072479248046875,
      "learning_rate": 4.951167513157147e-07,
      "loss": -0.1486,
      "num_tokens": 3550028.0,
      "reward": 0.060559920966625214,
      "reward_std": 0.051685988903045654,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0013165771961212158,
      "rewards/logprob_reward/std": 0.004063119646161795,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 559.25,
      "completions/mean_terminated_length": 528.2667236328125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.46296296296296297,
      "grad_norm": 2.8056032826451127,
      "kl": 0.068939208984375,
      "learning_rate": 4.950179322079697e-07,
      "loss": -0.1228,
      "num_tokens": 3574460.0,
      "reward": 0.05674087256193161,
      "reward_std": 0.052086036652326584,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0005454131751321256,
      "rewards/logprob_reward/std": 0.001222002669237554,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 553.90625,
      "completions/mean_terminated_length": 505.2758483886719,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "epoch": 0.4660493827160494,
      "grad_norm": 3.213821541862956,
      "kl": 0.093719482421875,
      "learning_rate": 4.949181332829299e-07,
      "loss": -0.1993,
      "num_tokens": 3599049.0,
      "reward": 0.05664665251970291,
      "reward_std": 0.046812109649181366,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0004407258238643408,
      "rewards/logprob_reward/std": 0.001193308737128973,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 740.0,
      "completions/mean_length": 492.0,
      "completions/mean_terminated_length": 474.83868408203125,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "epoch": 0.4691358024691358,
      "grad_norm": 2.9182994246328415,
      "kl": 0.08349609375,
      "learning_rate": 4.948173549396873e-07,
      "loss": -0.0844,
      "num_tokens": 3621193.0,
      "reward": 0.04452269524335861,
      "reward_std": 0.04726994037628174,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0008585481555201113,
      "rewards/logprob_reward/std": 0.001919503789395094,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 638.375,
      "completions/mean_terminated_length": 566.9629516601562,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.4722222222222222,
      "grad_norm": 2.215355945019327,
      "kl": 0.07696533203125,
      "learning_rate": 4.947155975812506e-07,
      "loss": -0.1416,
      "num_tokens": 3648181.0,
      "reward": 0.050617851316928864,
      "reward_std": 0.03936385735869408,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0006865040631964803,
      "rewards/logprob_reward/std": 0.0015793000347912312,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 576.0625,
      "completions/mean_terminated_length": 561.6129150390625,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "epoch": 0.47530864197530864,
      "grad_norm": 3.4099798068408775,
      "kl": 0.08404541015625,
      "learning_rate": 4.946128616145436e-07,
      "loss": -0.3246,
      "num_tokens": 3673447.0,
      "reward": 0.0597553625702858,
      "reward_std": 0.04533667117357254,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0004226211167406291,
      "rewards/logprob_reward/std": 0.001463288557715714,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 835.0,
      "completions/max_terminated_length": 835.0,
      "completions/mean_length": 551.5625,
      "completions/mean_terminated_length": 551.5625,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.4783950617283951,
      "grad_norm": 2.6736789485757373,
      "kl": 0.07171630859375,
      "learning_rate": 4.945091474504037e-07,
      "loss": -0.0012,
      "num_tokens": 3697693.0,
      "reward": 0.06379599869251251,
      "reward_std": 0.027524448931217194,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001439998159185052,
      "rewards/logprob_reward/std": 0.0025939131155610085,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 873.0,
      "completions/max_terminated_length": 873.0,
      "completions/mean_length": 576.09375,
      "completions/mean_terminated_length": 576.09375,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.48148148148148145,
      "grad_norm": 2.458439816355618,
      "kl": 0.08233642578125,
      "learning_rate": 4.944044555035793e-07,
      "loss": 0.0019,
      "num_tokens": 3723216.0,
      "reward": 0.06332716345787048,
      "reward_std": 0.03316696733236313,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0009190634591504931,
      "rewards/logprob_reward/std": 0.002278524450957775,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 572.71875,
      "completions/mean_terminated_length": 558.1612548828125,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.4845679012345679,
      "grad_norm": 2.1576719984246027,
      "kl": 0.076141357421875,
      "learning_rate": 4.9429878619273e-07,
      "loss": 0.0094,
      "num_tokens": 3747383.0,
      "reward": 0.04400166496634483,
      "reward_std": 0.03943231329321861,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0002796255284920335,
      "rewards/logprob_reward/std": 0.0007145073032006621,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 499.46875,
      "completions/mean_terminated_length": 482.5483703613281,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.4876543209876543,
      "grad_norm": 4.9867034473928875,
      "kl": 0.0755615234375,
      "learning_rate": 4.941921399404232e-07,
      "loss": -0.3322,
      "num_tokens": 3769338.0,
      "reward": 0.04752832651138306,
      "reward_std": 0.05379118397831917,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0007259194971993566,
      "rewards/logprob_reward/std": 0.0020718248561024666,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 822.0,
      "completions/mean_length": 541.3125,
      "completions/mean_terminated_length": 525.741943359375,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 0.49074074074074076,
      "grad_norm": 2.384406357181412,
      "kl": 0.0927734375,
      "learning_rate": 4.940845171731329e-07,
      "loss": -0.157,
      "num_tokens": 3793096.0,
      "reward": 0.07222440838813782,
      "reward_std": 0.03863812983036041,
      "rewards/format_reward_func/mean": 0.71875,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.00038822778151370585,
      "rewards/logprob_reward/std": 0.0012460780562832952,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 530.21875,
      "completions/mean_terminated_length": 514.290283203125,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 0.49382716049382713,
      "grad_norm": 2.677695822774252,
      "kl": 0.0936279296875,
      "learning_rate": 4.939759183212388e-07,
      "loss": -0.1621,
      "num_tokens": 3816327.0,
      "reward": 0.0631580650806427,
      "reward_std": 0.041855860501527786,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0007311837980523705,
      "rewards/logprob_reward/std": 0.001986136194318533,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 590.59375,
      "completions/mean_terminated_length": 528.6785888671875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.49691358024691357,
      "grad_norm": 2.5034008153124625,
      "kl": 0.10211181640625,
      "learning_rate": 4.938663438190232e-07,
      "loss": -0.121,
      "num_tokens": 3841958.0,
      "reward": 0.054024044424295425,
      "reward_std": 0.03780882805585861,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0009989351965487003,
      "rewards/logprob_reward/std": 0.0030066664330661297,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 919.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 545.5625,
      "completions/mean_terminated_length": 545.5625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.5,
      "grad_norm": 2.8793487536491207,
      "kl": 0.075836181640625,
      "learning_rate": 4.937557941046705e-07,
      "loss": -0.2787,
      "num_tokens": 3865608.0,
      "reward": 0.04443252086639404,
      "reward_std": 0.04472089558839798,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0007583519909530878,
      "rewards/logprob_reward/std": 0.0016288717743009329,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 879.0,
      "completions/mean_length": 600.1875,
      "completions/mean_terminated_length": 521.7037353515625,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.5030864197530864,
      "grad_norm": 4.3373373253550636,
      "kl": 0.088104248046875,
      "learning_rate": 4.936442696202648e-07,
      "loss": -0.2416,
      "num_tokens": 3890882.0,
      "reward": 0.0412057563662529,
      "reward_std": 0.05025993287563324,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.000645287218503654,
      "rewards/logprob_reward/std": 0.002340418053790927,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 687.6875,
      "completions/mean_terminated_length": 610.0769653320312,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.5061728395061729,
      "grad_norm": 2.2070094386749357,
      "kl": 0.07977294921875,
      "learning_rate": 4.935317708117881e-07,
      "loss": -0.1125,
      "num_tokens": 3919648.0,
      "reward": 0.047586582601070404,
      "reward_std": 0.04772721976041794,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.000790646648965776,
      "rewards/logprob_reward/std": 0.0019396664574742317,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 625.34375,
      "completions/mean_terminated_length": 598.7667236328125,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 0.5092592592592593,
      "grad_norm": 2.4562094566655577,
      "kl": 0.100830078125,
      "learning_rate": 4.934182981291187e-07,
      "loss": -0.0291,
      "num_tokens": 3946491.0,
      "reward": 0.05210525542497635,
      "reward_std": 0.040608860552310944,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0023391738068312407,
      "rewards/logprob_reward/std": 0.00513032078742981,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 864.0,
      "completions/mean_length": 635.0,
      "completions/mean_terminated_length": 579.4285888671875,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.5123456790123457,
      "grad_norm": 2.615833483797188,
      "kl": 0.093414306640625,
      "learning_rate": 4.933038520260299e-07,
      "loss": -0.0896,
      "num_tokens": 3973851.0,
      "reward": 0.038603805005550385,
      "reward_std": 0.032515063881874084,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012264486867934465,
      "rewards/logprob_reward/std": 0.0025094689335674047,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 804.0,
      "completions/mean_length": 587.21875,
      "completions/mean_terminated_length": 542.0344848632812,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 0.5154320987654321,
      "grad_norm": 2.715888113920899,
      "kl": 0.09375,
      "learning_rate": 4.931884329601869e-07,
      "loss": -0.1415,
      "num_tokens": 3999294.0,
      "reward": 0.04511518031358719,
      "reward_std": 0.052907492965459824,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0015168681275099516,
      "rewards/logprob_reward/std": 0.004005419556051493,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 597.84375,
      "completions/mean_terminated_length": 584.0967407226562,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.5185185185185185,
      "grad_norm": 2.7278046327751158,
      "kl": 0.09930419921875,
      "learning_rate": 4.930720413931463e-07,
      "loss": -0.1705,
      "num_tokens": 4025385.0,
      "reward": 0.04705619066953659,
      "reward_std": 0.04059264063835144,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.00020132274948991835,
      "rewards/logprob_reward/std": 0.0006632709410041571,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 842.0,
      "completions/mean_length": 618.6875,
      "completions/mean_terminated_length": 605.6129150390625,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.5216049382716049,
      "grad_norm": 1.982411689281292,
      "kl": 0.09539794921875,
      "learning_rate": 4.929546777903534e-07,
      "loss": -0.2065,
      "num_tokens": 4051479.0,
      "reward": 0.06299344450235367,
      "reward_std": 0.04481876641511917,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0005482725682668388,
      "rewards/logprob_reward/std": 0.0015520042506977916,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 590.0,
      "completions/mean_terminated_length": 561.0667114257812,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 0.5246913580246914,
      "grad_norm": 2.572582741579363,
      "kl": 0.0904541015625,
      "learning_rate": 4.928363426211407e-07,
      "loss": -0.1826,
      "num_tokens": 4076823.0,
      "reward": 0.04143887758255005,
      "reward_std": 0.04543890058994293,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0009043083409778774,
      "rewards/logprob_reward/std": 0.0023466269485652447,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 855.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 541.5625,
      "completions/mean_terminated_length": 541.5625,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.5277777777777778,
      "grad_norm": 2.731307993510179,
      "kl": 0.093017578125,
      "learning_rate": 4.927170363587262e-07,
      "loss": -0.1749,
      "num_tokens": 4100509.0,
      "reward": 0.050791189074516296,
      "reward_std": 0.039393700659275055,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0008790968568064272,
      "rewards/logprob_reward/std": 0.0023332065902650356,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 597.0625,
      "completions/mean_terminated_length": 568.6000366210938,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.5308641975308642,
      "grad_norm": 2.1871290908855476,
      "kl": 0.099365234375,
      "learning_rate": 4.925967594802109e-07,
      "loss": -0.1,
      "num_tokens": 4125855.0,
      "reward": 0.05643356591463089,
      "reward_std": 0.0411858856678009,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00020396310719661415,
      "rewards/logprob_reward/std": 0.0011045490391552448,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 861.0,
      "completions/max_terminated_length": 861.0,
      "completions/mean_length": 597.21875,
      "completions/mean_terminated_length": 597.21875,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.5339506172839507,
      "grad_norm": 2.44677423545142,
      "kl": 0.08740234375,
      "learning_rate": 4.924755124665774e-07,
      "loss": -0.1932,
      "num_tokens": 4151322.0,
      "reward": 0.05687759071588516,
      "reward_std": 0.03429555520415306,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0006973271956667304,
      "rewards/logprob_reward/std": 0.0018503706669434905,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 656.5625,
      "completions/mean_terminated_length": 632.0667114257812,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.5370370370370371,
      "grad_norm": 1.9145257120214048,
      "kl": 0.08758544921875,
      "learning_rate": 4.923532958026878e-07,
      "loss": -0.0852,
      "num_tokens": 4178220.0,
      "reward": 0.0600404292345047,
      "reward_std": 0.032063812017440796,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0007393647101707757,
      "rewards/logprob_reward/std": 0.0033789281733334064,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 566.8125,
      "completions/mean_terminated_length": 552.0645141601562,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "epoch": 0.5401234567901234,
      "grad_norm": 3.707207888615661,
      "kl": 0.0899658203125,
      "learning_rate": 4.922301099772821e-07,
      "loss": -0.3079,
      "num_tokens": 4202470.0,
      "reward": 0.04734790325164795,
      "reward_std": 0.053314127027988434,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0005254444549791515,
      "rewards/logprob_reward/std": 0.001620819210074842,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 617.75,
      "completions/mean_terminated_length": 590.6666870117188,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.5432098765432098,
      "grad_norm": 2.4510673949748543,
      "kl": 0.10162353515625,
      "learning_rate": 4.921059554829753e-07,
      "loss": -0.0792,
      "num_tokens": 4228838.0,
      "reward": 0.04168153554201126,
      "reward_std": 0.04795132577419281,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001173929194919765,
      "rewards/logprob_reward/std": 0.0027986096683889627,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 896.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 608.875,
      "completions/mean_terminated_length": 608.875,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.5462962962962963,
      "grad_norm": 2.654131100571325,
      "kl": 0.11077880859375,
      "learning_rate": 4.91980832816257e-07,
      "loss": -0.1494,
      "num_tokens": 4254806.0,
      "reward": 0.057833198457956314,
      "reward_std": 0.0521986298263073,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001759108155965805,
      "rewards/logprob_reward/std": 0.0036081629805266857,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 803.0,
      "completions/mean_length": 564.46875,
      "completions/mean_terminated_length": 533.8333740234375,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 0.5493827160493827,
      "grad_norm": 2.870744145847357,
      "kl": 0.09637451171875,
      "learning_rate": 4.918547424774873e-07,
      "loss": -0.0505,
      "num_tokens": 4279321.0,
      "reward": 0.05646614730358124,
      "reward_std": 0.047001827508211136,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00024016425595618784,
      "rewards/logprob_reward/std": 0.001035280991345644,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 880.0,
      "completions/mean_length": 572.21875,
      "completions/mean_terminated_length": 557.6451416015625,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.5524691358024691,
      "grad_norm": 2.8791619627740213,
      "kl": 0.10540771484375,
      "learning_rate": 4.917276849708972e-07,
      "loss": -0.1143,
      "num_tokens": 4304416.0,
      "reward": 0.04742274433374405,
      "reward_std": 0.05460613593459129,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0006086068460717797,
      "rewards/logprob_reward/std": 0.0015793552156537771,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 567.375,
      "completions/mean_terminated_length": 536.933349609375,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.5555555555555556,
      "grad_norm": 2.760430399481095,
      "kl": 0.09259033203125,
      "learning_rate": 4.915996608045842e-07,
      "loss": -0.2382,
      "num_tokens": 4328968.0,
      "reward": 0.05662311986088753,
      "reward_std": 0.044914934784173965,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0004145758575759828,
      "rewards/logprob_reward/std": 0.0011752157006412745,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 970.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 564.5625,
      "completions/mean_terminated_length": 564.5625,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.558641975308642,
      "grad_norm": 2.754564018346516,
      "kl": 0.087188720703125,
      "learning_rate": 4.914706704905125e-07,
      "loss": -0.1993,
      "num_tokens": 4353026.0,
      "reward": 0.04729136824607849,
      "reward_std": 0.05444261059165001,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0004626337031368166,
      "rewards/logprob_reward/std": 0.0011854376643896103,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 921.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 607.875,
      "completions/mean_terminated_length": 607.875,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 0.5617283950617284,
      "grad_norm": 2.9974037878149566,
      "kl": 0.086883544921875,
      "learning_rate": 4.913407145445093e-07,
      "loss": -0.2072,
      "num_tokens": 4379070.0,
      "reward": 0.05068790540099144,
      "reward_std": 0.03280695900321007,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0007643378921784461,
      "rewards/logprob_reward/std": 0.0019737649708986282,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 555.0,
      "completions/mean_terminated_length": 523.7333374023438,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.5648148148148148,
      "grad_norm": 2.7019698751031886,
      "kl": 0.09429931640625,
      "learning_rate": 4.912097934862632e-07,
      "loss": -0.2045,
      "num_tokens": 4403086.0,
      "reward": 0.05067024007439613,
      "reward_std": 0.051602087914943695,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0007447098614647985,
      "rewards/logprob_reward/std": 0.0020433899480849504,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 653.46875,
      "completions/mean_terminated_length": 628.7667236328125,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "epoch": 0.5679012345679012,
      "grad_norm": 2.8024004394039803,
      "kl": 0.08929443359375,
      "learning_rate": 4.910779078393228e-07,
      "loss": -0.1117,
      "num_tokens": 4430317.0,
      "reward": 0.04114678502082825,
      "reward_std": 0.04779033362865448,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0005797599442303181,
      "rewards/logprob_reward/std": 0.001542158075608313,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 598.03125,
      "completions/mean_terminated_length": 553.9655151367188,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.5709876543209876,
      "grad_norm": 3.139001329290585,
      "kl": 0.09881591796875,
      "learning_rate": 4.909450581310935e-07,
      "loss": -0.1868,
      "num_tokens": 4455838.0,
      "reward": 0.046875,
      "reward_std": 0.04568375647068024,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 899.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 578.53125,
      "completions/mean_terminated_length": 578.53125,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.5740740740740741,
      "grad_norm": 2.715790957358654,
      "kl": 0.09686279296875,
      "learning_rate": 4.908112448928363e-07,
      "loss": -0.1373,
      "num_tokens": 4480711.0,
      "reward": 0.053631968796253204,
      "reward_std": 0.04750828444957733,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0005632966640405357,
      "rewards/logprob_reward/std": 0.001424881280399859,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 578.09375,
      "completions/mean_terminated_length": 548.36669921875,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 0.5771604938271605,
      "grad_norm": 2.4131258394211077,
      "kl": 0.10272216796875,
      "learning_rate": 4.906764686596651e-07,
      "loss": -0.2549,
      "num_tokens": 4505642.0,
      "reward": 0.04702848196029663,
      "reward_std": 0.04592697322368622,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0001705329050309956,
      "rewards/logprob_reward/std": 0.0006710395682603121,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1017.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 546.40625,
      "completions/mean_terminated_length": 546.40625,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.5802469135802469,
      "grad_norm": 2.708093528492381,
      "kl": 0.0911865234375,
      "learning_rate": 4.90540729970545e-07,
      "loss": -0.0681,
      "num_tokens": 4529631.0,
      "reward": 0.05057809129357338,
      "reward_std": 0.04895421117544174,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0006423235754482448,
      "rewards/logprob_reward/std": 0.0017255189595744014,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 655.625,
      "completions/mean_terminated_length": 603.0,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.5833333333333334,
      "grad_norm": 2.3868795689153552,
      "kl": 0.109130859375,
      "learning_rate": 4.904040293682897e-07,
      "loss": -0.1324,
      "num_tokens": 4557343.0,
      "reward": 0.0293910950422287,
      "reward_std": 0.03942374140024185,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0014067735755816102,
      "rewards/logprob_reward/std": 0.004575551021844149,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 580.0,
      "completions/mean_terminated_length": 550.4000244140625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.5864197530864198,
      "grad_norm": 3.073421440606636,
      "kl": 0.11328125,
      "learning_rate": 4.902663673995597e-07,
      "loss": -0.0849,
      "num_tokens": 4582635.0,
      "reward": 0.05398586764931679,
      "reward_std": 0.04952556639909744,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0009565172949805856,
      "rewards/logprob_reward/std": 0.0022744557354599237,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 607.125,
      "completions/mean_terminated_length": 593.6774291992188,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 0.5895061728395061,
      "grad_norm": 2.599229833389622,
      "kl": 0.10601806640625,
      "learning_rate": 4.9012774461486e-07,
      "loss": -0.1981,
      "num_tokens": 4609015.0,
      "reward": 0.04128945991396904,
      "reward_std": 0.04696265980601311,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0007382894400507212,
      "rewards/logprob_reward/std": 0.0021007144823670387,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 553.3125,
      "completions/mean_terminated_length": 538.1290283203125,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.5925925925925926,
      "grad_norm": 3.088150746873547,
      "kl": 0.10693359375,
      "learning_rate": 4.899881615685376e-07,
      "loss": -0.2816,
      "num_tokens": 4633257.0,
      "reward": 0.04374999925494194,
      "reward_std": 0.051933757960796356,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 641.96875,
      "completions/mean_terminated_length": 602.4483032226562,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.595679012345679,
      "grad_norm": 2.4233293562287863,
      "kl": 0.1162109375,
      "learning_rate": 4.898476188187798e-07,
      "loss": -0.0939,
      "num_tokens": 4660224.0,
      "reward": 0.05658697336912155,
      "reward_std": 0.04677470028400421,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0003744146670214832,
      "rewards/logprob_reward/std": 0.0017363273072987795,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 588.875,
      "completions/mean_terminated_length": 574.8386840820312,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 0.5987654320987654,
      "grad_norm": 2.024359732059673,
      "kl": 0.09490966796875,
      "learning_rate": 4.897061169276118e-07,
      "loss": -0.0746,
      "num_tokens": 4685660.0,
      "reward": 0.05633340775966644,
      "reward_std": 0.03949132561683655,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 9.267372661270201e-05,
      "rewards/logprob_reward/std": 0.0005242417682893574,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 535.625,
      "completions/mean_terminated_length": 519.8709716796875,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "epoch": 0.6018518518518519,
      "grad_norm": 2.5964712865604556,
      "kl": 0.09527587890625,
      "learning_rate": 4.895636564608942e-07,
      "loss": -0.2392,
      "num_tokens": 4709340.0,
      "reward": 0.04751855507493019,
      "reward_std": 0.04732377082109451,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0007150607998482883,
      "rewards/logprob_reward/std": 0.0019512978615239263,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 809.0,
      "completions/mean_length": 594.09375,
      "completions/mean_terminated_length": 565.433349609375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.6049382716049383,
      "grad_norm": 2.571796913593863,
      "kl": 0.108642578125,
      "learning_rate": 4.894202379883206e-07,
      "loss": -0.1736,
      "num_tokens": 4735019.0,
      "reward": 0.03775563836097717,
      "reward_std": 0.04141201078891754,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.00028404415934346616,
      "rewards/logprob_reward/std": 0.0008146684267558157,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 918.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 600.46875,
      "completions/mean_terminated_length": 600.46875,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 0.6080246913580247,
      "grad_norm": 2.9396181866146875,
      "kl": 0.10418701171875,
      "learning_rate": 4.892758620834165e-07,
      "loss": -0.1014,
      "num_tokens": 4760726.0,
      "reward": 0.05058019980788231,
      "reward_std": 0.05364552140235901,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0006446627667173743,
      "rewards/logprob_reward/std": 0.001384554896503687,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 620.0,
      "completions/mean_terminated_length": 578.2069091796875,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.6111111111111112,
      "grad_norm": 3.4608153300475,
      "kl": 0.11517333984375,
      "learning_rate": 4.891305293235351e-07,
      "loss": -0.2337,
      "num_tokens": 4787374.0,
      "reward": 0.03217801824212074,
      "reward_std": 0.040280796587467194,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0010311321821063757,
      "rewards/logprob_reward/std": 0.00260757259093225,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1001.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 588.5625,
      "completions/mean_terminated_length": 588.5625,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.6141975308641975,
      "grad_norm": 3.293033384055398,
      "kl": 0.11273193359375,
      "learning_rate": 4.889842402898569e-07,
      "loss": -0.2187,
      "num_tokens": 4812480.0,
      "reward": 0.056570760905742645,
      "reward_std": 0.03236209228634834,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0003563996870070696,
      "rewards/logprob_reward/std": 0.0010288916528224945,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 556.40625,
      "completions/mean_terminated_length": 541.3225708007812,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.6172839506172839,
      "grad_norm": 2.8393316824113106,
      "kl": 0.11126708984375,
      "learning_rate": 4.888369955673858e-07,
      "loss": -0.2211,
      "num_tokens": 4836969.0,
      "reward": 0.04425449296832085,
      "reward_std": 0.053492575883865356,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0005605471087619662,
      "rewards/logprob_reward/std": 0.0018193412106484175,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 677.90625,
      "completions/mean_terminated_length": 598.0385131835938,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.6203703703703703,
      "grad_norm": 2.4586059669310236,
      "kl": 0.104949951171875,
      "learning_rate": 4.88688795744948e-07,
      "loss": -0.033,
      "num_tokens": 4865474.0,
      "reward": 0.04104198142886162,
      "reward_std": 0.04574280232191086,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.00046331313205882907,
      "rewards/logprob_reward/std": 0.0013247487368062139,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 590.03125,
      "completions/mean_terminated_length": 561.1000366210938,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.6234567901234568,
      "grad_norm": 3.0975596149761806,
      "kl": 0.126708984375,
      "learning_rate": 4.885396414151888e-07,
      "loss": -0.277,
      "num_tokens": 4891327.0,
      "reward": 0.05083741247653961,
      "reward_std": 0.05358371138572693,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0009304598206654191,
      "rewards/logprob_reward/std": 0.0025314248632639647,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 645.1875,
      "completions/mean_terminated_length": 606.0,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.6265432098765432,
      "grad_norm": 2.969395557908256,
      "kl": 0.10601806640625,
      "learning_rate": 4.883895331745707e-07,
      "loss": -0.2939,
      "num_tokens": 4918445.0,
      "reward": 0.03783799707889557,
      "reward_std": 0.053672000765800476,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0003755528596229851,
      "rewards/logprob_reward/std": 0.0009694079053588212,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 602.09375,
      "completions/mean_terminated_length": 558.4483032226562,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.6296296296296297,
      "grad_norm": 2.6446376143799233,
      "kl": 0.106781005859375,
      "learning_rate": 4.882384716233709e-07,
      "loss": -0.0601,
      "num_tokens": 4944108.0,
      "reward": 0.03495427593588829,
      "reward_std": 0.03947734832763672,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0006436366238631308,
      "rewards/logprob_reward/std": 0.0021368886809796095,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 593.34375,
      "completions/mean_terminated_length": 579.4515991210938,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.6327160493827161,
      "grad_norm": 2.1730488753880812,
      "kl": 0.10675048828125,
      "learning_rate": 4.880864573656785e-07,
      "loss": -0.1947,
      "num_tokens": 4969399.0,
      "reward": 0.053266704082489014,
      "reward_std": 0.0473971962928772,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0001574465713929385,
      "rewards/logprob_reward/std": 0.0007713346858508885,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 616.90625,
      "completions/mean_terminated_length": 574.7930908203125,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.6358024691358025,
      "grad_norm": 2.732093927778844,
      "kl": 0.10491943359375,
      "learning_rate": 4.879334910093926e-07,
      "loss": -0.1368,
      "num_tokens": 4995388.0,
      "reward": 0.04427650570869446,
      "reward_std": 0.044918712228536606,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.000585003406740725,
      "rewards/logprob_reward/std": 0.0019014464924111962,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 826.0,
      "completions/mean_length": 561.5,
      "completions/mean_terminated_length": 475.85186767578125,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.6388888888888888,
      "grad_norm": 2.1492116299588457,
      "kl": 0.125244140625,
      "learning_rate": 4.877795731662202e-07,
      "loss": -0.2233,
      "num_tokens": 5019556.0,
      "reward": 0.0563012957572937,
      "reward_std": 0.041363805532455444,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 5.699522444047034e-05,
      "rewards/logprob_reward/std": 0.00022817167337052524,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 644.6875,
      "completions/mean_terminated_length": 619.4000244140625,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.6419753086419753,
      "grad_norm": 2.4248377093804447,
      "kl": 0.110107421875,
      "learning_rate": 4.876247044516724e-07,
      "loss": -0.0783,
      "num_tokens": 5046702.0,
      "reward": 0.028475604951381683,
      "reward_std": 0.031558647751808167,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0003895602421835065,
      "rewards/logprob_reward/std": 0.0011856276541948318,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 627.78125,
      "completions/mean_terminated_length": 571.1785888671875,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.6450617283950617,
      "grad_norm": 2.7390330955608304,
      "kl": 0.1085205078125,
      "learning_rate": 4.874688854850635e-07,
      "loss": -0.1363,
      "num_tokens": 5073107.0,
      "reward": 0.03207886219024658,
      "reward_std": 0.039595384150743484,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0009209603886120021,
      "rewards/logprob_reward/std": 0.002882851753383875,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 660.65625,
      "completions/mean_terminated_length": 539.5416870117188,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.6481481481481481,
      "grad_norm": 2.8889382030754427,
      "kl": 0.12890625,
      "learning_rate": 4.873121168895075e-07,
      "loss": -0.1702,
      "num_tokens": 5101032.0,
      "reward": 0.031383778899908066,
      "reward_std": 0.04649616777896881,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.00014864497643429786,
      "rewards/logprob_reward/std": 0.0005854673217982054,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 929.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 616.3125,
      "completions/mean_terminated_length": 616.3125,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.6512345679012346,
      "grad_norm": 2.8242270768474826,
      "kl": 0.10198974609375,
      "learning_rate": 4.87154399291916e-07,
      "loss": -0.1788,
      "num_tokens": 5126994.0,
      "reward": 0.04099217802286148,
      "reward_std": 0.047876451164484024,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0004079728969372809,
      "rewards/logprob_reward/std": 0.0014560659183189273,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 919.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 572.5,
      "completions/mean_terminated_length": 572.5,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.654320987654321,
      "grad_norm": 2.6266475279642494,
      "kl": 0.12945556640625,
      "learning_rate": 4.869957333229955e-07,
      "loss": -0.2273,
      "num_tokens": 5151918.0,
      "reward": 0.04423429071903229,
      "reward_std": 0.04626988619565964,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0005380974616855383,
      "rewards/logprob_reward/std": 0.002099623205140233,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 633.0625,
      "completions/mean_terminated_length": 620.4515991210938,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.6574074074074074,
      "grad_norm": 2.484121195726362,
      "kl": 0.11895751953125,
      "learning_rate": 4.868361196172453e-07,
      "loss": -0.1246,
      "num_tokens": 5178844.0,
      "reward": 0.05328449606895447,
      "reward_std": 0.04550441354513168,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.00017721450421959162,
      "rewards/logprob_reward/std": 0.0010024766670539975,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 671.71875,
      "completions/mean_terminated_length": 635.27587890625,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.6604938271604939,
      "grad_norm": 2.1254880703665893,
      "kl": 0.10760498046875,
      "learning_rate": 4.866755588129542e-07,
      "loss": -0.0416,
      "num_tokens": 5206727.0,
      "reward": 0.05987684428691864,
      "reward_std": 0.04718624800443649,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0005576012772507966,
      "rewards/logprob_reward/std": 0.0027992515824735165,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 856.0,
      "completions/mean_length": 577.40625,
      "completions/mean_terminated_length": 563.0,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.6635802469135802,
      "grad_norm": 2.6988438433636692,
      "kl": 0.163818359375,
      "learning_rate": 4.86514051552199e-07,
      "loss": -0.1574,
      "num_tokens": 5232276.0,
      "reward": 0.060177482664585114,
      "reward_std": 0.03958515450358391,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0008916450897231698,
      "rewards/logprob_reward/std": 0.002548168646171689,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 630.15625,
      "completions/mean_terminated_length": 539.2692260742188,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.6666666666666666,
      "grad_norm": 2.314573383950194,
      "kl": 0.118896484375,
      "learning_rate": 4.863515984808408e-07,
      "loss": -0.1538,
      "num_tokens": 5258621.0,
      "reward": 0.031334538012742996,
      "reward_std": 0.03406674042344093,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 9.393022628501058e-05,
      "rewards/logprob_reward/std": 0.0003963824128732085,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 631.0,
      "completions/mean_terminated_length": 604.800048828125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.6697530864197531,
      "grad_norm": 2.435706130506513,
      "kl": 0.139892578125,
      "learning_rate": 4.861882002485234e-07,
      "loss": -0.1688,
      "num_tokens": 5285157.0,
      "reward": 0.03769915550947189,
      "reward_std": 0.04663718491792679,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0002212801482528448,
      "rewards/logprob_reward/std": 0.0008804204408079386,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 643.03125,
      "completions/mean_terminated_length": 617.6333618164062,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.6728395061728395,
      "grad_norm": 2.410842172320329,
      "kl": 0.12322998046875,
      "learning_rate": 4.860238575086699e-07,
      "loss": -0.1206,
      "num_tokens": 5312018.0,
      "reward": 0.05332405865192413,
      "reward_std": 0.046081870794296265,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.00022117517073638737,
      "rewards/logprob_reward/std": 0.0012511557433754206,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 641.78125,
      "completions/mean_terminated_length": 629.4515991210938,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.6759259259259259,
      "grad_norm": 2.4154129091967533,
      "kl": 0.1409912109375,
      "learning_rate": 4.858585709184806e-07,
      "loss": -0.0788,
      "num_tokens": 5338995.0,
      "reward": 0.034696124494075775,
      "reward_std": 0.04090491682291031,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0003568080719560385,
      "rewards/logprob_reward/std": 0.0017188823549076915,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 772.0,
      "completions/max_terminated_length": 772.0,
      "completions/mean_length": 542.375,
      "completions/mean_terminated_length": 542.375,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.6790123456790124,
      "grad_norm": 2.0939959907633647,
      "kl": 0.141845703125,
      "learning_rate": 4.856923411389302e-07,
      "loss": -0.1423,
      "num_tokens": 5362559.0,
      "reward": 0.0443311408162117,
      "reward_std": 0.04073891416192055,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0006457103881984949,
      "rewards/logprob_reward/std": 0.001709131756797433,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 703.78125,
      "completions/mean_terminated_length": 629.8846435546875,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 0.6820987654320988,
      "grad_norm": 2.094011752914783,
      "kl": 0.1314697265625,
      "learning_rate": 4.855251688347653e-07,
      "loss": -0.0918,
      "num_tokens": 5391912.0,
      "reward": 0.04383155703544617,
      "reward_std": 0.0465577095746994,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 9.061659511644393e-05,
      "rewards/logprob_reward/std": 0.000512604892719537,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 637.125,
      "completions/mean_terminated_length": 624.6451416015625,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.6851851851851852,
      "grad_norm": 2.128294726656678,
      "kl": 0.1490478515625,
      "learning_rate": 4.853570546745014e-07,
      "loss": 0.005,
      "num_tokens": 5418584.0,
      "reward": 0.06256760656833649,
      "reward_std": 0.04150272160768509,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 7.511388685088605e-05,
      "rewards/logprob_reward/std": 0.0004249082994647324,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 690.84375,
      "completions/mean_terminated_length": 629.1481323242188,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 0.6882716049382716,
      "grad_norm": 2.3503052167681093,
      "kl": 0.1336669921875,
      "learning_rate": 4.851879993304208e-07,
      "loss": -0.0943,
      "num_tokens": 5446951.0,
      "reward": 0.03453109413385391,
      "reward_std": 0.045533955097198486,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.000173439402715303,
      "rewards/logprob_reward/std": 0.0007092207088135183,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 574.375,
      "completions/mean_terminated_length": 559.8709716796875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.691358024691358,
      "grad_norm": 2.6499183684428376,
      "kl": 0.1510009765625,
      "learning_rate": 4.850180034785691e-07,
      "loss": -0.0877,
      "num_tokens": 5472051.0,
      "reward": 0.04117923229932785,
      "reward_std": 0.041509099304676056,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0006158142350614071,
      "rewards/logprob_reward/std": 0.0034835711121559143,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 609.3125,
      "completions/mean_terminated_length": 595.9354858398438,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "epoch": 0.6944444444444444,
      "grad_norm": 2.155445771776111,
      "kl": 0.14617919921875,
      "learning_rate": 4.848470677987532e-07,
      "loss": -0.2251,
      "num_tokens": 5497749.0,
      "reward": 0.056270308792591095,
      "reward_std": 0.046627260744571686,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 2.2565931431017816e-05,
      "rewards/logprob_reward/std": 0.00012765217979904264,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 671.8125,
      "completions/mean_terminated_length": 606.5925903320312,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "epoch": 0.6975308641975309,
      "grad_norm": 2.274814183916674,
      "kl": 0.140380859375,
      "learning_rate": 4.846751929745383e-07,
      "loss": -0.0308,
      "num_tokens": 5525995.0,
      "reward": 0.04077719897031784,
      "reward_std": 0.04752274602651596,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.00016910559497773647,
      "rewards/logprob_reward/std": 0.0009566056542098522,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 622.125,
      "completions/mean_terminated_length": 580.5516967773438,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.7006172839506173,
      "grad_norm": 2.9197701183067477,
      "kl": 0.166259765625,
      "learning_rate": 4.845023796932454e-07,
      "loss": -0.1223,
      "num_tokens": 5552743.0,
      "reward": 0.05314267426729202,
      "reward_std": 0.040380291640758514,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 1.963327122211922e-05,
      "rewards/logprob_reward/std": 0.00011106255260528997,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 867.0,
      "completions/mean_length": 589.125,
      "completions/mean_terminated_length": 560.1333618164062,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.7037037037037037,
      "grad_norm": 2.424408425246981,
      "kl": 0.1558837890625,
      "learning_rate": 4.84328628645948e-07,
      "loss": -0.0422,
      "num_tokens": 5577511.0,
      "reward": 0.037620242685079575,
      "reward_std": 0.04844558984041214,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.00013360078446567059,
      "rewards/logprob_reward/std": 0.00046856305561959743,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 596.21875,
      "completions/mean_terminated_length": 582.4193115234375,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.7067901234567902,
      "grad_norm": 2.062889286939384,
      "kl": 0.16064453125,
      "learning_rate": 4.841539405274698e-07,
      "loss": -0.069,
      "num_tokens": 5603286.0,
      "reward": 0.05941256880760193,
      "reward_std": 0.04757439345121384,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 4.17455485148821e-05,
      "rewards/logprob_reward/std": 0.00023614846577402204,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 665.03125,
      "completions/mean_terminated_length": 613.75,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.7098765432098766,
      "grad_norm": 2.3825616687668183,
      "kl": 0.1590576171875,
      "learning_rate": 4.839783160363821e-07,
      "loss": -0.1477,
      "num_tokens": 5631183.0,
      "reward": 0.031745754182338715,
      "reward_std": 0.0247428547590971,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0005508353933691978,
      "rewards/logprob_reward/std": 0.0031159953214228153,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 835.0,
      "completions/mean_length": 616.21875,
      "completions/mean_terminated_length": 540.7037353515625,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.7129629629629629,
      "grad_norm": 2.285236303731177,
      "kl": 0.1536865234375,
      "learning_rate": 4.838017558750004e-07,
      "loss": -0.0934,
      "num_tokens": 5657486.0,
      "reward": 0.03437500074505806,
      "reward_std": 0.03125,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 683.65625,
      "completions/mean_terminated_length": 635.0357666015625,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.7160493827160493,
      "grad_norm": 2.0886870733529688,
      "kl": 0.1519775390625,
      "learning_rate": 4.836242607493819e-07,
      "loss": -0.1256,
      "num_tokens": 5686087.0,
      "reward": 0.04076904058456421,
      "reward_std": 0.04559372365474701,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.00016004232747945935,
      "rewards/logprob_reward/std": 0.0009053361136466265,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 652.1875,
      "completions/mean_terminated_length": 627.4000244140625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.7191358024691358,
      "grad_norm": 1.9978822850201081,
      "kl": 0.15631103515625,
      "learning_rate": 4.834458313693228e-07,
      "loss": -0.1412,
      "num_tokens": 5713269.0,
      "reward": 0.04694700241088867,
      "reward_std": 0.047571003437042236,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 8.000048546819016e-05,
      "rewards/logprob_reward/std": 0.0004525511176325381,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 625.28125,
      "completions/mean_terminated_length": 584.0344848632812,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.7222222222222222,
      "grad_norm": 2.1953735198767794,
      "kl": 0.163818359375,
      "learning_rate": 4.832664684483555e-07,
      "loss": -0.1073,
      "num_tokens": 5739410.0,
      "reward": 0.03125,
      "reward_std": 0.039433758705854416,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 934.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 605.15625,
      "completions/mean_terminated_length": 605.15625,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.7253086419753086,
      "grad_norm": 2.350257186190819,
      "kl": 0.16217041015625,
      "learning_rate": 4.830861727037453e-07,
      "loss": -0.1122,
      "num_tokens": 5765107.0,
      "reward": 0.04065612331032753,
      "reward_std": 0.047597043216228485,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 3.458164428593591e-05,
      "rewards/logprob_reward/std": 0.00019562333181966096,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 977.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 599.53125,
      "completions/mean_terminated_length": 599.53125,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.7283950617283951,
      "grad_norm": 3.016768158405979,
      "kl": 0.16583251953125,
      "learning_rate": 4.82904944856488e-07,
      "loss": -0.2691,
      "num_tokens": 5790592.0,
      "reward": 0.05000000447034836,
      "reward_std": 0.046650636941194534,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 665.09375,
      "completions/mean_terminated_length": 641.1666870117188,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.7314814814814815,
      "grad_norm": 1.7930722744681806,
      "kl": 0.1746826171875,
      "learning_rate": 4.827227856313066e-07,
      "loss": -0.0918,
      "num_tokens": 5818535.0,
      "reward": 0.046875,
      "reward_std": 0.033183757215738297,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 652.34375,
      "completions/mean_terminated_length": 613.8965454101562,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.7345679012345679,
      "grad_norm": 2.53665479529064,
      "kl": 0.16156005859375,
      "learning_rate": 4.825396957566491e-07,
      "loss": -0.1198,
      "num_tokens": 5846002.0,
      "reward": 0.056460775434970856,
      "reward_std": 0.05556637793779373,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00023419583158101887,
      "rewards/logprob_reward/std": 0.0013248117174953222,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 650.625,
      "completions/mean_terminated_length": 597.2857666015625,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.7376543209876543,
      "grad_norm": 2.328099770784432,
      "kl": 0.16650390625,
      "learning_rate": 4.823556759646847e-07,
      "loss": -0.1603,
      "num_tokens": 5872854.0,
      "reward": 0.04088599234819412,
      "reward_std": 0.03160203993320465,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.00028999114874750376,
      "rewards/logprob_reward/std": 0.0012471069348976016,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 680.03125,
      "completions/mean_terminated_length": 616.3333129882812,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 0.7407407407407407,
      "grad_norm": 2.1795366927271536,
      "kl": 0.1593017578125,
      "learning_rate": 4.821707269913016e-07,
      "loss": -0.1534,
      "num_tokens": 5901135.0,
      "reward": 0.040657684206962585,
      "reward_std": 0.038401514291763306,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 3.631383879110217e-05,
      "rewards/logprob_reward/std": 0.0002054220822174102,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 960.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 624.6875,
      "completions/mean_terminated_length": 624.6875,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.7438271604938271,
      "grad_norm": 3.011508723204471,
      "kl": 0.18048095703125,
      "learning_rate": 4.819848495761037e-07,
      "loss": -0.2283,
      "num_tokens": 5927709.0,
      "reward": 0.03750000149011612,
      "reward_std": 0.05386751517653465,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 870.0,
      "completions/mean_length": 603.53125,
      "completions/mean_terminated_length": 560.0344848632812,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.7469135802469136,
      "grad_norm": 2.6892546568064466,
      "kl": 0.1878662109375,
      "learning_rate": 4.817980444624076e-07,
      "loss": -0.0577,
      "num_tokens": 5953886.0,
      "reward": 0.03154817223548889,
      "reward_std": 0.04138335958123207,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0003313017077744007,
      "rewards/logprob_reward/std": 0.0012565014185383916,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 611.59375,
      "completions/mean_terminated_length": 552.6785888671875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.75,
      "grad_norm": 2.3164282719071725,
      "kl": 0.1785888671875,
      "learning_rate": 4.816103123972395e-07,
      "loss": -0.0893,
      "num_tokens": 5979949.0,
      "reward": 0.02188117988407612,
      "reward_std": 0.03317964822053909,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 6.866796866233926e-06,
      "rewards/logprob_reward/std": 3.8844467781018466e-05,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 640.03125,
      "completions/mean_terminated_length": 627.6451416015625,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.7530864197530864,
      "grad_norm": 2.5360110689160513,
      "kl": 0.189208984375,
      "learning_rate": 4.814216541313329e-07,
      "loss": -0.119,
      "num_tokens": 6007210.0,
      "reward": 0.05000000074505806,
      "reward_std": 0.05386751517653465,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 574.9375,
      "completions/mean_terminated_length": 545.0,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.7561728395061729,
      "grad_norm": 2.862147931219411,
      "kl": 0.1947021484375,
      "learning_rate": 4.812320704191252e-07,
      "loss": -0.1728,
      "num_tokens": 6032380.0,
      "reward": 0.04378309100866318,
      "reward_std": 0.04661262780427933,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 3.676979031297378e-05,
      "rewards/logprob_reward/std": 0.00020800135098397732,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 880.0,
      "completions/mean_length": 657.0,
      "completions/mean_terminated_length": 632.5333862304688,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.7592592592592593,
      "grad_norm": 1.905636384943984,
      "kl": 0.1708984375,
      "learning_rate": 4.81041562018754e-07,
      "loss": -0.0456,
      "num_tokens": 6059756.0,
      "reward": 0.031550176441669464,
      "reward_std": 0.03263639286160469,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.00033352768514305353,
      "rewards/logprob_reward/std": 0.0015560640022158623,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 638.84375,
      "completions/mean_terminated_length": 626.4193115234375,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.7623456790123457,
      "grad_norm": 2.316538484469351,
      "kl": 0.16217041015625,
      "learning_rate": 4.808501296920552e-07,
      "loss": -0.0881,
      "num_tokens": 6086767.0,
      "reward": 0.03137172758579254,
      "reward_std": 0.04463999718427658,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.00013525362010113895,
      "rewards/logprob_reward/std": 0.0007651100168004632,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 625.03125,
      "completions/mean_terminated_length": 598.433349609375,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.7654320987654321,
      "grad_norm": 2.787586722028665,
      "kl": 0.199462890625,
      "learning_rate": 4.806577742045593e-07,
      "loss": -0.117,
      "num_tokens": 6113352.0,
      "reward": 0.04375000298023224,
      "reward_std": 0.039433758705854416,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 665.1875,
      "completions/mean_terminated_length": 628.0689697265625,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.7685185185185185,
      "grad_norm": 2.0220631710175896,
      "kl": 0.179443359375,
      "learning_rate": 4.804644963254887e-07,
      "loss": -0.0636,
      "num_tokens": 6140810.0,
      "reward": 0.04062499850988388,
      "reward_std": 0.04761751741170883,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 559.96875,
      "completions/mean_terminated_length": 545.0,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.7716049382716049,
      "grad_norm": 2.0315025235024637,
      "kl": 0.17327880859375,
      "learning_rate": 4.80270296827754e-07,
      "loss": -0.1317,
      "num_tokens": 6164657.0,
      "reward": 0.046875,
      "reward_std": 0.04568375647068024,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 655.9375,
      "completions/mean_terminated_length": 617.862060546875,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 0.7746913580246914,
      "grad_norm": 1.5902732578748726,
      "kl": 0.1885986328125,
      "learning_rate": 4.800751764879516e-07,
      "loss": 0.0153,
      "num_tokens": 6191683.0,
      "reward": 0.05624999850988388,
      "reward_std": 0.034150637686252594,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 648.65625,
      "completions/mean_terminated_length": 609.8275756835938,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.7777777777777778,
      "grad_norm": 2.1517295561708334,
      "kl": 0.177978515625,
      "learning_rate": 4.798791360863602e-07,
      "loss": -0.0841,
      "num_tokens": 6219104.0,
      "reward": 0.028237810358405113,
      "reward_std": 0.04027276858687401,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.00012534351844806224,
      "rewards/logprob_reward/std": 0.00070905004395172,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 709.625,
      "completions/mean_terminated_length": 651.4074096679688,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 0.7808641975308642,
      "grad_norm": 2.1099299720320883,
      "kl": 0.17547607421875,
      "learning_rate": 4.796821764069378e-07,
      "loss": -0.0995,
      "num_tokens": 6248428.0,
      "reward": 0.031371042132377625,
      "reward_std": 0.0465136282145977,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.00013449120160657912,
      "rewards/logprob_reward/std": 0.0007607970619574189,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 633.96875,
      "completions/mean_terminated_length": 578.25,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "epoch": 0.7839506172839507,
      "grad_norm": 1.9882778612144554,
      "kl": 0.1922607421875,
      "learning_rate": 4.794842982373188e-07,
      "loss": -0.0389,
      "num_tokens": 6275199.0,
      "reward": 0.046908486634492874,
      "reward_std": 0.040423277765512466,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 3.720493259606883e-05,
      "rewards/logprob_reward/std": 0.0002104628802044317,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 644.5625,
      "completions/mean_terminated_length": 619.2667236328125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.7870370370370371,
      "grad_norm": 2.14624151505934,
      "kl": 0.1773681640625,
      "learning_rate": 4.7928550236881e-07,
      "loss": -0.115,
      "num_tokens": 6302429.0,
      "reward": 0.03125,
      "reward_std": 0.039433758705854416,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 629.1875,
      "completions/mean_terminated_length": 572.7857666015625,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 0.7901234567901234,
      "grad_norm": 1.6876928844785541,
      "kl": 0.1802978515625,
      "learning_rate": 4.790857895963888e-07,
      "loss": -0.1124,
      "num_tokens": 6329195.0,
      "reward": 0.03437500074505806,
      "reward_std": 0.033183757215738297,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 717.5625,
      "completions/mean_terminated_length": 631.760009765625,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.7932098765432098,
      "grad_norm": 2.0101325941957104,
      "kl": 0.1756591796875,
      "learning_rate": 4.788851607186988e-07,
      "loss": -0.0888,
      "num_tokens": 6359709.0,
      "reward": 0.015625,
      "reward_std": 0.03125,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 693.03125,
      "completions/mean_terminated_length": 645.75,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.7962962962962963,
      "grad_norm": 1.9579710425311287,
      "kl": 0.180419921875,
      "learning_rate": 4.786836165380472e-07,
      "loss": -0.08,
      "num_tokens": 6388346.0,
      "reward": 0.03446173667907715,
      "reward_std": 0.040301889181137085,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 9.637646871851757e-05,
      "rewards/logprob_reward/std": 0.0005451876204460859,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 709.34375,
      "completions/mean_terminated_length": 636.7307739257812,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.7993827160493827,
      "grad_norm": 1.7946421677104327,
      "kl": 0.15673828125,
      "learning_rate": 4.784811578604013e-07,
      "loss": -0.0107,
      "num_tokens": 6417713.0,
      "reward": 0.03437500074505806,
      "reward_std": 0.03846687823534012,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 690.03125,
      "completions/mean_terminated_length": 667.7667236328125,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 0.8024691358024691,
      "grad_norm": 2.0942031317541856,
      "kl": 0.16387939453125,
      "learning_rate": 4.782777854953857e-07,
      "loss": -0.0174,
      "num_tokens": 6446446.0,
      "reward": 0.022133365273475647,
      "reward_std": 0.033654019236564636,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0002870730822905898,
      "rewards/logprob_reward/std": 0.0015146363293752074,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 735.9375,
      "completions/mean_terminated_length": 682.5925903320312,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 0.8055555555555556,
      "grad_norm": 2.042868542930924,
      "kl": 0.175537109375,
      "learning_rate": 4.780735002562785e-07,
      "loss": -0.0075,
      "num_tokens": 6476684.0,
      "reward": 0.03125,
      "reward_std": 0.046650636941194534,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 655.09375,
      "completions/mean_terminated_length": 616.9310302734375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.808641975308642,
      "grad_norm": 2.447517932403874,
      "kl": 0.1767578125,
      "learning_rate": 4.778683029600089e-07,
      "loss": -0.1325,
      "num_tokens": 6504075.0,
      "reward": 0.0375194288790226,
      "reward_std": 0.05192091315984726,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 2.1584961359621957e-05,
      "rewards/logprob_reward/std": 0.00012210298154968768,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 715.09375,
      "completions/mean_terminated_length": 657.888916015625,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.8117283950617284,
      "grad_norm": 1.875741545500234,
      "kl": 0.18011474609375,
      "learning_rate": 4.776621944271526e-07,
      "loss": -0.0333,
      "num_tokens": 6533122.0,
      "reward": 0.03437500074505806,
      "reward_std": 0.03846687823534012,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 682.5625,
      "completions/mean_terminated_length": 603.7692260742188,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.8148148148148148,
      "grad_norm": 1.8667237803366017,
      "kl": 0.1600341796875,
      "learning_rate": 4.774551754819299e-07,
      "loss": -0.0557,
      "num_tokens": 6561848.0,
      "reward": 0.025137821212410927,
      "reward_std": 0.03209925442934036,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0001531346351839602,
      "rewards/logprob_reward/std": 0.0006108815432526171,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 651.21875,
      "completions/mean_terminated_length": 626.36669921875,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.8179012345679012,
      "grad_norm": 2.2028515956195074,
      "kl": 0.180908203125,
      "learning_rate": 4.772472469522015e-07,
      "loss": -0.0599,
      "num_tokens": 6589463.0,
      "reward": 0.04062499850988388,
      "reward_std": 0.04761751741170883,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 614.375,
      "completions/mean_terminated_length": 601.1612548828125,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.8209876543209876,
      "grad_norm": 1.7546127139870733,
      "kl": 0.1773681640625,
      "learning_rate": 4.770384096694658e-07,
      "loss": -0.087,
      "num_tokens": 6615663.0,
      "reward": 0.0375639870762825,
      "reward_std": 0.039305780082941055,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 7.109808211680502e-05,
      "rewards/logprob_reward/std": 0.00040219147922471166,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 942.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 612.71875,
      "completions/mean_terminated_length": 612.71875,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.8240740740740741,
      "grad_norm": 2.473014319654774,
      "kl": 0.195068359375,
      "learning_rate": 4.7682866446885475e-07,
      "loss": -0.2364,
      "num_tokens": 6641934.0,
      "reward": 0.05369546636939049,
      "reward_std": 0.051967013627290726,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0006338492967188358,
      "rewards/logprob_reward/std": 0.0025384912732988596,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 631.09375,
      "completions/mean_terminated_length": 604.9000244140625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.8271604938271605,
      "grad_norm": 1.9919107132515261,
      "kl": 0.1748046875,
      "learning_rate": 4.766180121891316e-07,
      "loss": -0.1175,
      "num_tokens": 6668405.0,
      "reward": 0.04062500223517418,
      "reward_std": 0.04040063917636871,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 689.53125,
      "completions/mean_terminated_length": 667.2333984375,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.8302469135802469,
      "grad_norm": 1.9712023706550943,
      "kl": 0.160888671875,
      "learning_rate": 4.7640645367268663e-07,
      "loss": -0.0685,
      "num_tokens": 6696922.0,
      "reward": 0.03437500074505806,
      "reward_std": 0.04040063917636871,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 689.0,
      "completions/mean_terminated_length": 666.6666870117188,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.8333333333333334,
      "grad_norm": 2.2532725263612514,
      "kl": 0.1661376953125,
      "learning_rate": 4.761939897655343e-07,
      "loss": -0.2022,
      "num_tokens": 6725354.0,
      "reward": 0.04394396394491196,
      "reward_std": 0.048724181950092316,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00021551498502958566,
      "rewards/logprob_reward/std": 0.0012191367568448186,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 748.59375,
      "completions/mean_terminated_length": 730.2333984375,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 0.8364197530864198,
      "grad_norm": 1.2982697679548028,
      "kl": 0.1802978515625,
      "learning_rate": 4.7598062131730943e-07,
      "loss": -0.0056,
      "num_tokens": 6755825.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.019716879352927208,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 706.46875,
      "completions/mean_terminated_length": 673.6206665039062,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.8395061728395061,
      "grad_norm": 2.212537028222659,
      "kl": 0.1746826171875,
      "learning_rate": 4.757663491812644e-07,
      "loss": -0.0554,
      "num_tokens": 6784996.0,
      "reward": 0.03750000149011612,
      "reward_std": 0.046650636941194534,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 665.03125,
      "completions/mean_terminated_length": 613.75,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.8425925925925926,
      "grad_norm": 1.464750024079652,
      "kl": 0.1788330078125,
      "learning_rate": 4.755511742142652e-07,
      "loss": 0.0147,
      "num_tokens": 6812541.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 713.21875,
      "completions/mean_terminated_length": 668.8214721679688,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 0.845679012345679,
      "grad_norm": 1.6818283675716525,
      "kl": 0.1953125,
      "learning_rate": 4.753350972767883e-07,
      "loss": 0.0092,
      "num_tokens": 6841740.0,
      "reward": 0.02530525252223015,
      "reward_std": 0.03476113826036453,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0003391671634744853,
      "rewards/logprob_reward/std": 0.001918619149364531,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 691.1875,
      "completions/mean_terminated_length": 629.5555419921875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.8487654320987654,
      "grad_norm": 1.902781376698886,
      "kl": 0.1510009765625,
      "learning_rate": 4.75118119232917e-07,
      "loss": -0.0528,
      "num_tokens": 6870134.0,
      "reward": 0.02187499962747097,
      "reward_std": 0.03125,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 648.0,
      "completions/mean_terminated_length": 609.1034545898438,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.8518518518518519,
      "grad_norm": 1.8544806027546288,
      "kl": 0.1793212890625,
      "learning_rate": 4.749002409503382e-07,
      "loss": -0.0605,
      "num_tokens": 6897170.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.04040063917636871,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 731.15625,
      "completions/mean_terminated_length": 649.1599731445312,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "epoch": 0.8549382716049383,
      "grad_norm": 2.106745038986706,
      "kl": 0.1778564453125,
      "learning_rate": 4.7468146330033874e-07,
      "loss": -0.0921,
      "num_tokens": 6927391.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.04568375647068024,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 734.375,
      "completions/mean_terminated_length": 653.2799682617188,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.8580246913580247,
      "grad_norm": 1.9572245973206377,
      "kl": 0.2008056640625,
      "learning_rate": 4.7446178715780213e-07,
      "loss": -0.0408,
      "num_tokens": 6957539.0,
      "reward": 0.03125,
      "reward_std": 0.04136751592159271,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 760.53125,
      "completions/mean_terminated_length": 699.7307739257812,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.8611111111111112,
      "grad_norm": 1.6558992288536742,
      "kl": 0.17498779296875,
      "learning_rate": 4.742412134012047e-07,
      "loss": -0.0159,
      "num_tokens": 6988232.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.03750000149011612,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 693.875,
      "completions/mean_terminated_length": 671.86669921875,
      "completions/min_length": 333.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 0.8641975308641975,
      "grad_norm": 1.6715855208905281,
      "kl": 0.1986083984375,
      "learning_rate": 4.740197429126125e-07,
      "loss": -0.0083,
      "num_tokens": 7016524.0,
      "reward": 0.03750000149011612,
      "reward_std": 0.039433758705854416,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 769.09375,
      "completions/mean_terminated_length": 697.719970703125,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.8672839506172839,
      "grad_norm": 1.015516224027939,
      "kl": 0.183349609375,
      "learning_rate": 4.7379737657767745e-07,
      "loss": -0.0242,
      "num_tokens": 7048051.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 743.5625,
      "completions/mean_terminated_length": 703.5000610351562,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.8703703703703703,
      "grad_norm": 1.6338389583240374,
      "kl": 0.18310546875,
      "learning_rate": 4.7357411528563393e-07,
      "loss": -0.0296,
      "num_tokens": 7078457.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 817.75,
      "completions/mean_terminated_length": 737.0435180664062,
      "completions/min_length": 527.0,
      "completions/min_terminated_length": 527.0,
      "epoch": 0.8734567901234568,
      "grad_norm": 1.3637746511034814,
      "kl": 0.172607421875,
      "learning_rate": 4.733499599292955e-07,
      "loss": -0.0083,
      "num_tokens": 7111677.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.026933757588267326,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 738.09375,
      "completions/mean_terminated_length": 685.1481323242188,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 0.8765432098765432,
      "grad_norm": 1.4644497167448591,
      "kl": 0.190185546875,
      "learning_rate": 4.7312491140505064e-07,
      "loss": 0.001,
      "num_tokens": 7142244.0,
      "reward": 0.012586712837219238,
      "reward_std": 0.019661229103803635,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 9.634702291805297e-05,
      "rewards/logprob_reward/std": 0.0005450210883282125,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 746.3125,
      "completions/mean_terminated_length": 706.6428833007812,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 0.8796296296296297,
      "grad_norm": 0.9598329914024589,
      "kl": 0.1812744140625,
      "learning_rate": 4.7289897061285965e-07,
      "loss": -0.0125,
      "num_tokens": 7172806.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 751.53125,
      "completions/mean_terminated_length": 712.607177734375,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 0.8827160493827161,
      "grad_norm": 1.6261381639015626,
      "kl": 0.1966552734375,
      "learning_rate": 4.726721384562513e-07,
      "loss": 0.0051,
      "num_tokens": 7203255.0,
      "reward": 0.01579144224524498,
      "reward_std": 0.03158288449048996,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0001849345862865448,
      "rewards/logprob_reward/std": 0.0008219811716116965,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 722.5,
      "completions/mean_terminated_length": 702.4000244140625,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.8858024691358025,
      "grad_norm": 3.3427490662652533,
      "kl": 2.1798095703125,
      "learning_rate": 4.724444158423185e-07,
      "loss": -0.1087,
      "num_tokens": 7233075.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 727.46875,
      "completions/mean_terminated_length": 644.4400024414062,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.8888888888888888,
      "grad_norm": 1.5374078937953224,
      "kl": 0.19140625,
      "learning_rate": 4.722158036817154e-07,
      "loss": 0.017,
      "num_tokens": 7262850.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.0322168804705143,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 751.375,
      "completions/mean_terminated_length": 712.4285888671875,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 0.8919753086419753,
      "grad_norm": 1.1899686771082865,
      "kl": 0.202392578125,
      "learning_rate": 4.7198630288865304e-07,
      "loss": -0.0259,
      "num_tokens": 7293198.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 716.03125,
      "completions/mean_terminated_length": 684.1724243164062,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.8950617283950617,
      "grad_norm": 1.6683712729905364,
      "kl": 0.2373046875,
      "learning_rate": 4.7175591438089646e-07,
      "loss": -0.0445,
      "num_tokens": 7322647.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 737.78125,
      "completions/mean_terminated_length": 657.6400146484375,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.8981481481481481,
      "grad_norm": 1.3376678952734042,
      "kl": 0.1954345703125,
      "learning_rate": 4.7152463907976024e-07,
      "loss": -0.0169,
      "num_tokens": 7352640.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 726.03125,
      "completions/mean_terminated_length": 657.2692260742188,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "epoch": 0.9012345679012346,
      "grad_norm": 1.5295454118951755,
      "kl": 0.1878662109375,
      "learning_rate": 4.7129247791010563e-07,
      "loss": 0.0067,
      "num_tokens": 7382261.0,
      "reward": 0.02812500111758709,
      "reward_std": 0.040400635451078415,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 786.46875,
      "completions/mean_terminated_length": 719.9599609375,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.904320987654321,
      "grad_norm": 1.1736265111480144,
      "kl": 0.2108154296875,
      "learning_rate": 4.710594318003361e-07,
      "loss": 0.0235,
      "num_tokens": 7414016.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.019716879352927208,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 749.21875,
      "completions/mean_terminated_length": 672.2799682617188,
      "completions/min_length": 314.0,
      "completions/min_terminated_length": 314.0,
      "epoch": 0.9074074074074074,
      "grad_norm": 0.6293748884007017,
      "kl": 0.2100830078125,
      "learning_rate": 4.7082550168239423e-07,
      "loss": 0.0103,
      "num_tokens": 7444459.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 767.4375,
      "completions/mean_terminated_length": 708.2307739257812,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 0.9104938271604939,
      "grad_norm": 1.3906735964347727,
      "kl": 0.2081298828125,
      "learning_rate": 4.705906884917573e-07,
      "loss": -0.0229,
      "num_tokens": 7475525.0,
      "reward": 0.015625,
      "reward_std": 0.025966878980398178,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 860.03125,
      "completions/mean_terminated_length": 774.1428833007812,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.9135802469135802,
      "grad_norm": 1.1260611752928005,
      "kl": 0.18505859375,
      "learning_rate": 4.703549931674345e-07,
      "loss": 0.0285,
      "num_tokens": 7509594.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.019716879352927208,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 741.84375,
      "completions/mean_terminated_length": 689.5925903320312,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.9166666666666666,
      "grad_norm": 1.0722793437822726,
      "kl": 0.2017822265625,
      "learning_rate": 4.7011841665196227e-07,
      "loss": 0.0239,
      "num_tokens": 7539833.0,
      "reward": 0.015625,
      "reward_std": 0.020683757960796356,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 696.9375,
      "completions/mean_terminated_length": 663.1034545898438,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.9197530864197531,
      "grad_norm": 1.7028779933369635,
      "kl": 0.215087890625,
      "learning_rate": 4.6988095989140096e-07,
      "loss": -0.0488,
      "num_tokens": 7568163.0,
      "reward": 0.009437083266675472,
      "reward_std": 0.018874166533350945,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 6.898110586917028e-05,
      "rewards/logprob_reward/std": 0.0003902160678990185,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 840.46875,
      "completions/mean_terminated_length": 757.0454711914062,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 0.9228395061728395,
      "grad_norm": 0.7766438983512843,
      "kl": 0.1976318359375,
      "learning_rate": 4.6964262383533114e-07,
      "loss": -0.0189,
      "num_tokens": 7601986.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 706.34375,
      "completions/mean_terminated_length": 673.4827270507812,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.9259259259259259,
      "grad_norm": 0.9381400098964588,
      "kl": 0.198974609375,
      "learning_rate": 4.694034094368495e-07,
      "loss": 0.0088,
      "num_tokens": 7630801.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 830.59375,
      "completions/mean_terminated_length": 766.125,
      "completions/min_length": 580.0,
      "completions/min_terminated_length": 580.0,
      "epoch": 0.9290123456790124,
      "grad_norm": 1.001710780823997,
      "kl": 0.2493896484375,
      "learning_rate": 4.691633176525651e-07,
      "loss": 0.0101,
      "num_tokens": 7663768.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.013466878794133663,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 727.0625,
      "completions/mean_terminated_length": 643.9199829101562,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 0.9320987654320988,
      "grad_norm": 1.1882821653317572,
      "kl": 0.217529296875,
      "learning_rate": 4.689223494425959e-07,
      "loss": 0.0133,
      "num_tokens": 7693190.0,
      "reward": 0.006325289607048035,
      "reward_std": 0.01265057921409607,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 8.365471148863435e-05,
      "rewards/logprob_reward/std": 0.0004732225206680596,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 698.78125,
      "completions/mean_terminated_length": 623.7307739257812,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.9351851851851852,
      "grad_norm": 1.0455868863314848,
      "kl": 0.2236328125,
      "learning_rate": 4.686805057705645e-07,
      "loss": 0.036,
      "num_tokens": 7722127.0,
      "reward": 0.009409812279045582,
      "reward_std": 0.013536503538489342,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 3.868029307341203e-05,
      "rewards/logprob_reward/std": 0.00021880878193769604,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 799.09375,
      "completions/mean_terminated_length": 711.0869750976562,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.9382716049382716,
      "grad_norm": 0.8643090576442451,
      "kl": 0.2166748046875,
      "learning_rate": 4.684377876035944e-07,
      "loss": -0.0242,
      "num_tokens": 7753850.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 720.21875,
      "completions/mean_terminated_length": 676.8214721679688,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 0.941358024691358,
      "grad_norm": 1.4831676362735404,
      "kl": 0.189453125,
      "learning_rate": 4.681941959123063e-07,
      "loss": -0.0018,
      "num_tokens": 7782765.0,
      "reward": 0.00959782488644123,
      "reward_std": 0.01919564977288246,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0002475832006894052,
      "rewards/logprob_reward/std": 0.0010625235736370087,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 719.40625,
      "completions/mean_terminated_length": 675.8928833007812,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.9444444444444444,
      "grad_norm": 1.716708272903363,
      "kl": 0.2091064453125,
      "learning_rate": 4.6794973167081397e-07,
      "loss": -0.0398,
      "num_tokens": 7812450.0,
      "reward": 0.012772508896887302,
      "reward_std": 0.025545017793774605,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.00030278731719590724,
      "rewards/logprob_reward/std": 0.0013578330399468541,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 877.0,
      "completions/mean_length": 702.59375,
      "completions/mean_terminated_length": 595.4583740234375,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "epoch": 0.9475308641975309,
      "grad_norm": 0.009440908633934682,
      "kl": 0.2236328125,
      "learning_rate": 4.6770439585672046e-07,
      "loss": 0.0002,
      "num_tokens": 7841141.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 731.28125,
      "completions/mean_terminated_length": 701.0,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 0.9506172839506173,
      "grad_norm": 0.8410116745335985,
      "kl": 0.20556640625,
      "learning_rate": 4.6745818945111426e-07,
      "loss": -0.0108,
      "num_tokens": 7870938.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.007216878701001406,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 716.75,
      "completions/mean_terminated_length": 645.84619140625,
      "completions/min_length": 267.0,
      "completions/min_terminated_length": 267.0,
      "epoch": 0.9537037037037037,
      "grad_norm": 0.8943559824894225,
      "kl": 0.2095947265625,
      "learning_rate": 4.6721111343856547e-07,
      "loss": 0.0214,
      "num_tokens": 7900022.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 732.53125,
      "completions/mean_terminated_length": 665.2692260742188,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.9567901234567902,
      "grad_norm": 1.1298479953842961,
      "kl": 0.212646484375,
      "learning_rate": 4.669631688071214e-07,
      "loss": -0.017,
      "num_tokens": 7930307.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 742.59375,
      "completions/mean_terminated_length": 723.8333740234375,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 0.9598765432098766,
      "grad_norm": 1.3189588503979779,
      "kl": 0.20068359375,
      "learning_rate": 4.667143565483032e-07,
      "loss": 0.0114,
      "num_tokens": 7960646.0,
      "reward": 0.015625,
      "reward_std": 0.025966878980398178,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 645.40625,
      "completions/mean_terminated_length": 633.1935424804688,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.9629629629629629,
      "grad_norm": 1.6241557360798045,
      "kl": 0.251953125,
      "learning_rate": 4.664646776571015e-07,
      "loss": -0.0593,
      "num_tokens": 7987431.0,
      "reward": 0.015625,
      "reward_std": 0.025966878980398178,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 680.8125,
      "completions/mean_terminated_length": 617.25927734375,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.9660493827160493,
      "grad_norm": 1.6394861513909418,
      "kl": 0.2274169921875,
      "learning_rate": 4.662141331319726e-07,
      "loss": -0.0584,
      "num_tokens": 8015461.0,
      "reward": 0.015625,
      "reward_std": 0.025966878980398178,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 770.0,
      "completions/mean_terminated_length": 670.6087036132812,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.9691358024691358,
      "grad_norm": 6.867913802332101,
      "kl": 2.4453125,
      "learning_rate": 4.6596272397483445e-07,
      "loss": -0.0283,
      "num_tokens": 8047569.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 722.71875,
      "completions/mean_terminated_length": 666.9259033203125,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.9722222222222222,
      "grad_norm": 0.8842193813197436,
      "kl": 0.222412109375,
      "learning_rate": 4.657104511910626e-07,
      "loss": 0.0198,
      "num_tokens": 8076892.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.013466878794133663,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 792.96875,
      "completions/mean_terminated_length": 750.1851806640625,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.9753086419753086,
      "grad_norm": 0.8702597020094852,
      "kl": 0.2034912109375,
      "learning_rate": 4.654573157894861e-07,
      "loss": 0.071,
      "num_tokens": 8108903.0,
      "reward": 0.0034241636749356985,
      "reward_std": 0.006848327349871397,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00033240404445677996,
      "rewards/logprob_reward/std": 0.0013712375657632947,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 730.3125,
      "completions/mean_terminated_length": 615.3912963867188,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.9783950617283951,
      "grad_norm": 1.4219115095707027,
      "kl": 0.2218017578125,
      "learning_rate": 4.652033187823838e-07,
      "loss": -0.006,
      "num_tokens": 8138441.0,
      "reward": 0.015625,
      "reward_std": 0.025966878980398178,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 802.75,
      "completions/mean_terminated_length": 740.7999877929688,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 0.9814814814814815,
      "grad_norm": 0.8242556230145498,
      "kl": 0.237060546875,
      "learning_rate": 4.6494846118548e-07,
      "loss": 0.0014,
      "num_tokens": 8171069.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.014433757402002811,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 798.9375,
      "completions/mean_terminated_length": 696.6364135742188,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 0.9845679012345679,
      "grad_norm": 0.8312399161974559,
      "kl": 0.2086181640625,
      "learning_rate": 4.6469274401794044e-07,
      "loss": -0.0002,
      "num_tokens": 8203523.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 758.0625,
      "completions/mean_terminated_length": 696.6923217773438,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.9876543209876543,
      "grad_norm": 1.2649856030150175,
      "kl": 0.210205078125,
      "learning_rate": 4.6443616830236823e-07,
      "loss": -0.0077,
      "num_tokens": 8234337.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 743.875,
      "completions/mean_terminated_length": 665.4400024414062,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.9907407407407407,
      "grad_norm": 0.9759895177682247,
      "kl": 0.20849609375,
      "learning_rate": 4.641787350647997e-07,
      "loss": -0.0152,
      "num_tokens": 8264817.0,
      "reward": 0.006368691101670265,
      "reward_std": 0.01242492999881506,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.000131878987303935,
      "rewards/logprob_reward/std": 0.0007460201741196215,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 867.0,
      "completions/mean_length": 747.9375,
      "completions/mean_terminated_length": 670.6400146484375,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 0.9938271604938271,
      "grad_norm": 0.4973960755968238,
      "kl": 0.2061767578125,
      "learning_rate": 4.6392044533470053e-07,
      "loss": 0.0084,
      "num_tokens": 8294963.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 750.0625,
      "completions/mean_terminated_length": 710.9285888671875,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.9969135802469136,
      "grad_norm": 1.3680398351724956,
      "kl": 0.2352294921875,
      "learning_rate": 4.636613001449615e-07,
      "loss": -0.1376,
      "num_tokens": 8325681.0,
      "reward": 0.021934330463409424,
      "reward_std": 0.025899026542901993,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 6.592279532924294e-05,
      "rewards/logprob_reward/std": 0.00037291564512997866,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 763.1875,
      "completions/mean_terminated_length": 676.25,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 1.0,
      "grad_norm": 0.9804013605757141,
      "kl": 0.209716796875,
      "learning_rate": 4.6340130053189417e-07,
      "loss": -0.0133,
      "num_tokens": 8356563.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 751.875,
      "completions/mean_terminated_length": 713.0000610351562,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 1.0030864197530864,
      "grad_norm": 0.7121054896570292,
      "kl": 0.2164306640625,
      "learning_rate": 4.6314044753522703e-07,
      "loss": -0.0285,
      "num_tokens": 8387331.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.007216878701001406,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 807.09375,
      "completions/mean_terminated_length": 722.2174072265625,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 1.0061728395061729,
      "grad_norm": 0.7481577546494053,
      "kl": 0.2098388671875,
      "learning_rate": 4.6287874219810117e-07,
      "loss": -0.0078,
      "num_tokens": 8419610.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 678.46875,
      "completions/mean_terminated_length": 642.72412109375,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 1.0092592592592593,
      "grad_norm": 0.6604872270971704,
      "kl": 0.24267578125,
      "learning_rate": 4.626161855670663e-07,
      "loss": -0.0227,
      "num_tokens": 8447693.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 718.8125,
      "completions/mean_terminated_length": 633.3599853515625,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 1.0123456790123457,
      "grad_norm": 1.6790013679876383,
      "kl": 0.223876953125,
      "learning_rate": 4.623527786920761e-07,
      "loss": -0.0138,
      "num_tokens": 8477111.0,
      "reward": 0.015625,
      "reward_std": 0.025966878980398178,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 715.75,
      "completions/mean_terminated_length": 671.7142944335938,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 1.0154320987654322,
      "grad_norm": 1.6554652829660685,
      "kl": 0.244140625,
      "learning_rate": 4.620885226264847e-07,
      "loss": -0.092,
      "num_tokens": 8506231.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 699.78125,
      "completions/mean_terminated_length": 666.2413940429688,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.0185185185185186,
      "grad_norm": 1.361173488498756,
      "kl": 0.2412109375,
      "learning_rate": 4.6182341842704177e-07,
      "loss": -0.057,
      "num_tokens": 8534736.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 777.8125,
      "completions/mean_terminated_length": 681.478271484375,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 1.021604938271605,
      "grad_norm": 0.009587588033230739,
      "kl": 0.22265625,
      "learning_rate": 4.6155746715388903e-07,
      "loss": 0.0002,
      "num_tokens": 8565954.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 690.65625,
      "completions/mean_terminated_length": 628.9259033203125,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 1.0246913580246915,
      "grad_norm": 0.6248176904592802,
      "kl": 0.2423095703125,
      "learning_rate": 4.6129066987055533e-07,
      "loss": 0.0088,
      "num_tokens": 8594007.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 763.875,
      "completions/mean_terminated_length": 677.1666870117188,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 1.0277777777777777,
      "grad_norm": 1.2574969659485429,
      "kl": 0.2169189453125,
      "learning_rate": 4.610230276439526e-07,
      "loss": -0.0467,
      "num_tokens": 8625231.0,
      "reward": 0.006498053669929504,
      "reward_std": 0.012996107339859009,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0002756151370704174,
      "rewards/logprob_reward/std": 0.0015591145493090153,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 694.71875,
      "completions/mean_terminated_length": 660.6551513671875,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 1.0308641975308641,
      "grad_norm": 1.4501169847195978,
      "kl": 0.2662353515625,
      "learning_rate": 4.607545415443721e-07,
      "loss": -0.0524,
      "num_tokens": 8653974.0,
      "reward": 0.01875000074505806,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 815.125,
      "completions/mean_terminated_length": 756.6399536132812,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 1.0339506172839505,
      "grad_norm": 0.8971923851075063,
      "kl": 0.248779296875,
      "learning_rate": 4.604852126454792e-07,
      "loss": 0.0044,
      "num_tokens": 8686338.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.013466878794133663,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 829.375,
      "completions/mean_terminated_length": 712.6000366210938,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 1.037037037037037,
      "grad_norm": 0.009864169482841535,
      "kl": 0.251708984375,
      "learning_rate": 4.6021504202430983e-07,
      "loss": 0.0003,
      "num_tokens": 8719574.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 788.90625,
      "completions/mean_terminated_length": 665.7619018554688,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 1.0401234567901234,
      "grad_norm": 0.8852924568708636,
      "kl": 0.24755859375,
      "learning_rate": 4.599440307612661e-07,
      "loss": -0.0153,
      "num_tokens": 8751939.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 695.9375,
      "completions/mean_terminated_length": 649.0714721679688,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 1.0432098765432098,
      "grad_norm": 0.6502115458136378,
      "kl": 0.249755859375,
      "learning_rate": 4.5967217994011144e-07,
      "loss": 0.0045,
      "num_tokens": 8780457.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 776.375,
      "completions/mean_terminated_length": 719.2307739257812,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 1.0462962962962963,
      "grad_norm": 0.8923014443797044,
      "kl": 0.2325439453125,
      "learning_rate": 4.593994906479669e-07,
      "loss": 0.005,
      "num_tokens": 8811485.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 819.78125,
      "completions/mean_terminated_length": 739.8695678710938,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 1.0493827160493827,
      "grad_norm": 0.8974915320488661,
      "kl": 0.2322998046875,
      "learning_rate": 4.591259639753066e-07,
      "loss": 0.0091,
      "num_tokens": 8844558.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 740.25,
      "completions/mean_terminated_length": 710.8965454101562,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 1.0524691358024691,
      "grad_norm": 0.02576346005578449,
      "kl": 0.273681640625,
      "learning_rate": 4.588516010159529e-07,
      "loss": 0.0003,
      "num_tokens": 8874546.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 818.25,
      "completions/mean_terminated_length": 677.4736938476562,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 1.0555555555555556,
      "grad_norm": 0.9750758651930964,
      "kl": 0.23486328125,
      "learning_rate": 4.58576402867073e-07,
      "loss": 0.0302,
      "num_tokens": 8907694.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 800.0625,
      "completions/mean_terminated_length": 725.4166870117188,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 1.058641975308642,
      "grad_norm": 1.0796010864414354,
      "kl": 0.2349853515625,
      "learning_rate": 4.5830037062917373e-07,
      "loss": -0.0512,
      "num_tokens": 8939304.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 896.59375,
      "completions/mean_terminated_length": 769.1875,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 1.0617283950617284,
      "grad_norm": 0.007417990712236422,
      "kl": 0.2103271484375,
      "learning_rate": 4.580235054060971e-07,
      "loss": 0.0002,
      "num_tokens": 8974923.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 729.84375,
      "completions/mean_terminated_length": 661.9615478515625,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 1.0648148148148149,
      "grad_norm": 0.8842294127676017,
      "kl": 0.2591552734375,
      "learning_rate": 4.5774580830501685e-07,
      "loss": 0.0112,
      "num_tokens": 9004490.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 842.40625,
      "completions/mean_terminated_length": 718.1578979492188,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 1.0679012345679013,
      "grad_norm": 1.2648262971659887,
      "kl": 0.2259521484375,
      "learning_rate": 4.574672804364329e-07,
      "loss": -0.0847,
      "num_tokens": 9038055.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 881.84375,
      "completions/mean_terminated_length": 771.2777709960938,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 1.0709876543209877,
      "grad_norm": 0.5589430325783781,
      "kl": 0.2591552734375,
      "learning_rate": 4.571879229141674e-07,
      "loss": 0.0211,
      "num_tokens": 9073238.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 790.4375,
      "completions/mean_terminated_length": 725.0399780273438,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.074074074074074,
      "grad_norm": 0.9491041643011326,
      "kl": 0.220458984375,
      "learning_rate": 4.5690773685536037e-07,
      "loss": 0.0157,
      "num_tokens": 9104660.0,
      "reward": 0.0032900888472795486,
      "reward_std": 0.006580177694559097,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00018343192641623318,
      "rewards/logprob_reward/std": 0.0010376477148383856,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 811.25,
      "completions/mean_terminated_length": 683.6000366210938,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 1.0771604938271604,
      "grad_norm": 0.007563171515114929,
      "kl": 0.228515625,
      "learning_rate": 4.5662672338046513e-07,
      "loss": 0.0002,
      "num_tokens": 9137156.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 861.15625,
      "completions/mean_terminated_length": 763.4500122070312,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 1.0802469135802468,
      "grad_norm": 0.006928700848866776,
      "kl": 0.235107421875,
      "learning_rate": 4.5634488361324386e-07,
      "loss": 0.0002,
      "num_tokens": 9171669.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 879.5625,
      "completions/mean_terminated_length": 767.2222290039062,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 1.0833333333333333,
      "grad_norm": 0.8258267922504359,
      "kl": 0.2430419921875,
      "learning_rate": 4.560622186807628e-07,
      "loss": -0.0026,
      "num_tokens": 9206567.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 863.59375,
      "completions/mean_terminated_length": 790.6818237304688,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 1.0864197530864197,
      "grad_norm": 0.6010216095763495,
      "kl": 0.2197265625,
      "learning_rate": 4.5577872971338826e-07,
      "loss": 0.0034,
      "num_tokens": 9240642.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 784.625,
      "completions/mean_terminated_length": 729.3846435546875,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 1.0895061728395061,
      "grad_norm": 0.735612934087123,
      "kl": 0.2567138671875,
      "learning_rate": 4.554944178447816e-07,
      "loss": -0.0314,
      "num_tokens": 9271802.0,
      "reward": 0.0031897961162030697,
      "reward_std": 0.006208005361258984,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 7.199552055681124e-05,
      "rewards/logprob_reward/std": 0.00040726817678660154,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 866.3125,
      "completions/mean_terminated_length": 708.625,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 1.0925925925925926,
      "grad_norm": 0.527418541642273,
      "kl": 0.20654296875,
      "learning_rate": 4.552092842118952e-07,
      "loss": 0.0045,
      "num_tokens": 9306448.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 713.875,
      "completions/mean_terminated_length": 656.4444580078125,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 1.095679012345679,
      "grad_norm": 0.00802996047463695,
      "kl": 0.24072265625,
      "learning_rate": 4.549233299549674e-07,
      "loss": 0.0002,
      "num_tokens": 9335544.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 785.625,
      "completions/mean_terminated_length": 718.8800048828125,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 1.0987654320987654,
      "grad_norm": 0.5839103722605828,
      "kl": 0.2532958984375,
      "learning_rate": 4.546365562175184e-07,
      "loss": -0.0046,
      "num_tokens": 9367112.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 800.8125,
      "completions/mean_terminated_length": 648.1052856445312,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 1.1018518518518519,
      "grad_norm": 0.007467171241563814,
      "kl": 0.2427978515625,
      "learning_rate": 4.543489641463452e-07,
      "loss": 0.0002,
      "num_tokens": 9399534.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 889.25,
      "completions/mean_terminated_length": 770.3529663085938,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 1.1049382716049383,
      "grad_norm": 0.588361555509795,
      "kl": 0.2142333984375,
      "learning_rate": 4.540605548915175e-07,
      "loss": 0.0023,
      "num_tokens": 9434402.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 780.40625,
      "completions/mean_terminated_length": 735.2963256835938,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 1.1080246913580247,
      "grad_norm": 1.3784756902348092,
      "kl": 0.251708984375,
      "learning_rate": 4.537713296063729e-07,
      "loss": -0.091,
      "num_tokens": 9465811.0,
      "reward": 0.0033699856139719486,
      "reward_std": 0.006739971227943897,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00027220614720135927,
      "rewards/logprob_reward/std": 0.0015398304676637053,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 783.03125,
      "completions/mean_terminated_length": 727.423095703125,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 1.1111111111111112,
      "grad_norm": 0.9365005434702497,
      "kl": 0.25146484375,
      "learning_rate": 4.534812894475122e-07,
      "loss": -0.0112,
      "num_tokens": 9496884.0,
      "reward": 0.015625,
      "reward_std": 0.013466878794133663,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 841.53125,
      "completions/mean_terminated_length": 745.952392578125,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 1.1141975308641976,
      "grad_norm": 0.7835103765441798,
      "kl": 0.2340087890625,
      "learning_rate": 4.5319043557479474e-07,
      "loss": -0.0229,
      "num_tokens": 9530005.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 739.21875,
      "completions/mean_terminated_length": 659.47998046875,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 1.117283950617284,
      "grad_norm": 1.149886821199454,
      "kl": 0.26953125,
      "learning_rate": 4.5289876915133394e-07,
      "loss": -0.0284,
      "num_tokens": 9559552.0,
      "reward": 0.015625,
      "reward_std": 0.020683757960796356,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 824.5625,
      "completions/mean_terminated_length": 669.4444580078125,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 1.1203703703703705,
      "grad_norm": 0.019166753684404333,
      "kl": 0.2608642578125,
      "learning_rate": 4.5260629134349284e-07,
      "loss": 0.0003,
      "num_tokens": 9592594.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 815.40625,
      "completions/mean_terminated_length": 767.269287109375,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 512.0,
      "epoch": 1.123456790123457,
      "grad_norm": 0.8302941800160129,
      "kl": 0.2374267578125,
      "learning_rate": 4.523130033208788e-07,
      "loss": -0.0012,
      "num_tokens": 9624795.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 914.15625,
      "completions/mean_terminated_length": 804.3125,
      "completions/min_length": 543.0,
      "completions/min_terminated_length": 543.0,
      "epoch": 1.126543209876543,
      "grad_norm": 0.007455741944843248,
      "kl": 0.2080078125,
      "learning_rate": 4.520189062563393e-07,
      "loss": 0.0002,
      "num_tokens": 9661116.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 842.84375,
      "completions/mean_terminated_length": 771.95654296875,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 1.1296296296296295,
      "grad_norm": 0.007953891342227685,
      "kl": 0.2535400390625,
      "learning_rate": 4.5172400132595737e-07,
      "loss": 0.0003,
      "num_tokens": 9694599.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 877.875,
      "completions/mean_terminated_length": 790.2000122070312,
      "completions/min_length": 571.0,
      "completions/min_terminated_length": 571.0,
      "epoch": 1.132716049382716,
      "grad_norm": 1.1755357599564493,
      "kl": 0.2427978515625,
      "learning_rate": 4.514282897090464e-07,
      "loss": -0.0241,
      "num_tokens": 9729527.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 831.53125,
      "completions/mean_terminated_length": 756.2174072265625,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 1.1358024691358024,
      "grad_norm": 0.4794331986896714,
      "kl": 0.2257080078125,
      "learning_rate": 4.511317725881457e-07,
      "loss": 0.0135,
      "num_tokens": 9763080.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 892.8125,
      "completions/mean_terminated_length": 724.1428833007812,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 1.1388888888888888,
      "grad_norm": 0.9429283589487057,
      "kl": 0.222412109375,
      "learning_rate": 4.50834451149016e-07,
      "loss": 0.0209,
      "num_tokens": 9798514.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 883.53125,
      "completions/mean_terminated_length": 774.2777709960938,
      "completions/min_length": 567.0,
      "completions/min_terminated_length": 567.0,
      "epoch": 1.1419753086419753,
      "grad_norm": 0.015974949220601272,
      "kl": 0.2415771484375,
      "learning_rate": 4.505363265806342e-07,
      "loss": 0.0002,
      "num_tokens": 9833203.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 819.5,
      "completions/mean_terminated_length": 726.5454711914062,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 1.1450617283950617,
      "grad_norm": 0.00832902699087954,
      "kl": 0.237548828125,
      "learning_rate": 4.502374000751891e-07,
      "loss": 0.0002,
      "num_tokens": 9866067.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 797.34375,
      "completions/mean_terminated_length": 621.0555419921875,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 1.1481481481481481,
      "grad_norm": 0.007743588945211472,
      "kl": 0.2412109375,
      "learning_rate": 4.49937672828076e-07,
      "loss": 0.0002,
      "num_tokens": 9898042.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 785.53125,
      "completions/mean_terminated_length": 718.760009765625,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 1.1512345679012346,
      "grad_norm": 0.8878095648927022,
      "kl": 0.239013671875,
      "learning_rate": 4.4963714603789315e-07,
      "loss": -0.0012,
      "num_tokens": 9929267.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 847.78125,
      "completions/mean_terminated_length": 789.0416870117188,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 1.154320987654321,
      "grad_norm": 0.5494838061699857,
      "kl": 0.2425537109375,
      "learning_rate": 4.4933582090643516e-07,
      "loss": 0.0056,
      "num_tokens": 9963272.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 866.5,
      "completions/mean_terminated_length": 727.5294189453125,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 1.1574074074074074,
      "grad_norm": 0.8639857177081212,
      "kl": 0.2451171875,
      "learning_rate": 4.4903369863869e-07,
      "loss": 0.0069,
      "num_tokens": 9997876.0,
      "reward": 0.003503691405057907,
      "reward_std": 0.007007382810115814,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0004207683086860925,
      "rewards/logprob_reward/std": 0.0023802248761057854,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 837.5625,
      "completions/mean_terminated_length": 739.90478515625,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 1.1604938271604939,
      "grad_norm": 0.5189699901753596,
      "kl": 0.2509765625,
      "learning_rate": 4.4873078044283273e-07,
      "loss": 0.0215,
      "num_tokens": 10030938.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 769.4375,
      "completions/mean_terminated_length": 669.8261108398438,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 1.1635802469135803,
      "grad_norm": 0.6206949715555347,
      "kl": 0.249267578125,
      "learning_rate": 4.484270675302218e-07,
      "loss": -0.0048,
      "num_tokens": 10061652.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.007216878701001406,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 903.375,
      "completions/mean_terminated_length": 796.941162109375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.1666666666666667,
      "grad_norm": 0.5131021280005653,
      "kl": 0.20849609375,
      "learning_rate": 4.481225611153933e-07,
      "loss": 0.0262,
      "num_tokens": 10097412.0,
      "reward": 0.00010621514957165346,
      "reward_std": 0.00021243031369522214,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00011801683285739273,
      "rewards/logprob_reward/std": 0.0006676040356978774,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 814.75,
      "completions/mean_terminated_length": 745.0,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 1.1697530864197532,
      "grad_norm": 0.6250156569708267,
      "kl": 0.24755859375,
      "learning_rate": 4.4781726241605683e-07,
      "loss": -0.0184,
      "num_tokens": 10129588.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 798.03125,
      "completions/mean_terminated_length": 662.4500122070312,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 1.1728395061728394,
      "grad_norm": 1.4315230781531085,
      "kl": 0.2479248046875,
      "learning_rate": 4.4751117265309e-07,
      "loss": -0.0385,
      "num_tokens": 10162005.0,
      "reward": 0.006622622720897198,
      "reward_std": 0.013245245441794395,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00041402498027309775,
      "rewards/logprob_reward/std": 0.001968799391761422,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 760.03125,
      "completions/mean_terminated_length": 711.1481323242188,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 1.175925925925926,
      "grad_norm": 0.00883932204938891,
      "kl": 0.2576904296875,
      "learning_rate": 4.472042930505342e-07,
      "loss": 0.0003,
      "num_tokens": 10192274.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 839.6875,
      "completions/mean_terminated_length": 755.9091186523438,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 1.1790123456790123,
      "grad_norm": 0.03892469368776792,
      "kl": 0.2503662109375,
      "learning_rate": 4.46896624835589e-07,
      "loss": 0.0003,
      "num_tokens": 10225556.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 774.84375,
      "completions/mean_terminated_length": 728.7037353515625,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 1.1820987654320987,
      "grad_norm": 0.010711275418245406,
      "kl": 0.266357421875,
      "learning_rate": 4.465881692386078e-07,
      "loss": 0.0003,
      "num_tokens": 10256947.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 793.09375,
      "completions/mean_terminated_length": 728.4400024414062,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 1.1851851851851851,
      "grad_norm": 0.761983580989383,
      "kl": 0.2490234375,
      "learning_rate": 4.4627892749309273e-07,
      "loss": 0.0089,
      "num_tokens": 10288978.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 733.65625,
      "completions/mean_terminated_length": 679.888916015625,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 1.1882716049382716,
      "grad_norm": 1.1495846680178599,
      "kl": 0.2626953125,
      "learning_rate": 4.459689008356896e-07,
      "loss": -0.0152,
      "num_tokens": 10318591.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 783.28125,
      "completions/mean_terminated_length": 703.0416870117188,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 1.191358024691358,
      "grad_norm": 0.8158747930461432,
      "kl": 0.2667236328125,
      "learning_rate": 4.4565809050618317e-07,
      "loss": -0.0165,
      "num_tokens": 10349916.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 832.65625,
      "completions/mean_terminated_length": 788.5,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 1.1944444444444444,
      "grad_norm": 0.552951708589812,
      "kl": 0.237060546875,
      "learning_rate": 4.45346497747492e-07,
      "loss": 0.0138,
      "num_tokens": 10382705.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 725.34375,
      "completions/mean_terminated_length": 705.433349609375,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 1.1975308641975309,
      "grad_norm": 0.9796287105685971,
      "kl": 0.25048828125,
      "learning_rate": 4.450341238056634e-07,
      "loss": -0.0002,
      "num_tokens": 10412128.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 821.1875,
      "completions/mean_terminated_length": 682.4210815429688,
      "completions/min_length": 253.0,
      "completions/min_terminated_length": 253.0,
      "epoch": 1.2006172839506173,
      "grad_norm": 0.006439249094962117,
      "kl": 0.23486328125,
      "learning_rate": 4.4472096992986895e-07,
      "loss": 0.0002,
      "num_tokens": 10444734.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 849.09375,
      "completions/mean_terminated_length": 769.5909423828125,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 1.2037037037037037,
      "grad_norm": 0.008083405405338903,
      "kl": 0.245361328125,
      "learning_rate": 4.444070373723989e-07,
      "loss": 0.0002,
      "num_tokens": 10478673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 817.65625,
      "completions/mean_terminated_length": 736.9130859375,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 1.2067901234567902,
      "grad_norm": 0.3493451746942101,
      "kl": 0.23046875,
      "learning_rate": 4.4409232738865744e-07,
      "loss": 0.0107,
      "num_tokens": 10511194.0,
      "reward": 1.1146000360895414e-05,
      "reward_std": 2.2292000721790828e-05,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.2384445653879084e-05,
      "rewards/logprob_reward/std": 7.005700172157958e-05,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 843.40625,
      "completions/mean_terminated_length": 702.9444580078125,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 1.2098765432098766,
      "grad_norm": 0.008235397741820484,
      "kl": 0.2247314453125,
      "learning_rate": 4.4377684123715763e-07,
      "loss": 0.0002,
      "num_tokens": 10544707.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 842.4375,
      "completions/mean_terminated_length": 747.3333740234375,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 1.212962962962963,
      "grad_norm": 0.009912324408120282,
      "kl": 0.212890625,
      "learning_rate": 4.434605801795167e-07,
      "loss": 0.0002,
      "num_tokens": 10578045.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 866.03125,
      "completions/mean_terminated_length": 771.25,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 1.2160493827160495,
      "grad_norm": 0.5795413703576104,
      "kl": 0.20947265625,
      "learning_rate": 4.431435454804503e-07,
      "loss": 0.0015,
      "num_tokens": 10612394.0,
      "reward": 9.790882904781029e-05,
      "reward_std": 0.00019581765809562057,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00010878759348997846,
      "rewards/logprob_reward/std": 0.0006153955473564565,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 874.375,
      "completions/mean_terminated_length": 815.8261108398438,
      "completions/min_length": 579.0,
      "completions/min_terminated_length": 579.0,
      "epoch": 1.2191358024691359,
      "grad_norm": 0.009494320279332596,
      "kl": 0.241455078125,
      "learning_rate": 4.42825738407768e-07,
      "loss": 0.0002,
      "num_tokens": 10646382.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 815.15625,
      "completions/mean_terminated_length": 672.26318359375,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 1.2222222222222223,
      "grad_norm": 0.007764736406968568,
      "kl": 0.2137451171875,
      "learning_rate": 4.425071602323681e-07,
      "loss": 0.0002,
      "num_tokens": 10678903.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 784.84375,
      "completions/mean_terminated_length": 691.2608642578125,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 1.2253086419753085,
      "grad_norm": 0.006847226776488038,
      "kl": 0.235595703125,
      "learning_rate": 4.421878122282325e-07,
      "loss": 0.0002,
      "num_tokens": 10710122.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 802.25,
      "completions/mean_terminated_length": 715.478271484375,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 1.228395061728395,
      "grad_norm": 0.007805958167803609,
      "kl": 0.228759765625,
      "learning_rate": 4.4186769567242163e-07,
      "loss": 0.0002,
      "num_tokens": 10742558.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 812.375,
      "completions/mean_terminated_length": 685.4000244140625,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 1.2314814814814814,
      "grad_norm": 0.6232169916712146,
      "kl": 0.21337890625,
      "learning_rate": 4.4154681184506927e-07,
      "loss": 0.0202,
      "num_tokens": 10775026.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 876.40625,
      "completions/mean_terminated_length": 799.0952758789062,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 1.2345679012345678,
      "grad_norm": 0.8426294509652541,
      "kl": 0.2391357421875,
      "learning_rate": 4.4122516202937745e-07,
      "loss": 0.0246,
      "num_tokens": 10810183.0,
      "reward": 0.003223419887945056,
      "reward_std": 0.006446839775890112,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00010935530008282512,
      "rewards/logprob_reward/std": 0.0006186070386320353,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 834.28125,
      "completions/mean_terminated_length": 771.0416870117188,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 1.2376543209876543,
      "grad_norm": 0.007790026200100888,
      "kl": 0.25146484375,
      "learning_rate": 4.4090274751161144e-07,
      "loss": 0.0003,
      "num_tokens": 10843284.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 866.625,
      "completions/mean_terminated_length": 744.2222290039062,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 1.2407407407407407,
      "grad_norm": 0.5148751677044905,
      "kl": 0.230712890625,
      "learning_rate": 4.4057956958109453e-07,
      "loss": 0.0217,
      "num_tokens": 10877660.0,
      "reward": 4.3019586883019656e-05,
      "reward_std": 8.603917376603931e-05,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 4.7799541789572686e-05,
      "rewards/logprob_reward/std": 0.0002703950449358672,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 866.15625,
      "completions/mean_terminated_length": 726.8823852539062,
      "completions/min_length": 556.0,
      "completions/min_terminated_length": 556.0,
      "epoch": 1.2438271604938271,
      "grad_norm": 0.5816434254295519,
      "kl": 0.220458984375,
      "learning_rate": 4.402556295302029e-07,
      "loss": 0.0167,
      "num_tokens": 10912053.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.007216878701001406,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 776.53125,
      "completions/mean_terminated_length": 694.0416870117188,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 1.2469135802469136,
      "grad_norm": 0.6479712483848744,
      "kl": 0.24169921875,
      "learning_rate": 4.3993092865436035e-07,
      "loss": 0.0093,
      "num_tokens": 10943290.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.007216878701001406,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 783.0,
      "completions/mean_terminated_length": 702.6666870117188,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 1.25,
      "grad_norm": 0.7618585239127673,
      "kl": 0.2161865234375,
      "learning_rate": 4.3960546825203304e-07,
      "loss": -0.0298,
      "num_tokens": 10974822.0,
      "reward": 0.003186756744980812,
      "reward_std": 0.006209921091794968,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 6.861852307338268e-05,
      "rewards/logprob_reward/std": 0.0003881649754475802,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 818.3125,
      "completions/mean_terminated_length": 724.8181762695312,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 1.2530864197530864,
      "grad_norm": 0.007887791782644819,
      "kl": 0.2255859375,
      "learning_rate": 4.392792496247248e-07,
      "loss": 0.0002,
      "num_tokens": 11008236.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 809.65625,
      "completions/mean_terminated_length": 738.2083740234375,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 1.2561728395061729,
      "grad_norm": 0.6310960871369747,
      "kl": 0.225341796875,
      "learning_rate": 4.3895227407697135e-07,
      "loss": 0.0222,
      "num_tokens": 11040653.0,
      "reward": 0.0001372906262986362,
      "reward_std": 0.0002745812525972724,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001525451516499743,
      "rewards/logprob_reward/std": 0.0008629257208667696,
      "step": 407
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 792.625,
      "completions/mean_terminated_length": 739.2307739257812,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 1.2592592592592593,
      "grad_norm": 0.005567178329840646,
      "kl": NaN,
      "learning_rate": 4.3862454291633523e-07,
      "loss": 0.0002,
      "num_tokens": 11072561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 831.1875,
      "completions/mean_terminated_length": 699.26318359375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.2623456790123457,
      "grad_norm": 137.7828505628255,
      "kl": 60.7132568359375,
      "learning_rate": 4.382960574534009e-07,
      "loss": 0.0994,
      "num_tokens": 11106103.0,
      "reward": 0.0034079812467098236,
      "reward_std": 0.006815962493419647,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00031442352337762713,
      "rewards/logprob_reward/std": 0.0017786480020731688,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 832.875,
      "completions/mean_terminated_length": 758.0869750976562,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 1.2654320987654322,
      "grad_norm": 0.7507446008491451,
      "kl": 0.230224609375,
      "learning_rate": 4.3796681900176903e-07,
      "loss": 0.0227,
      "num_tokens": 11139267.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 817.65625,
      "completions/mean_terminated_length": 770.0385131835938,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 1.2685185185185186,
      "grad_norm": 0.8242287130162602,
      "kl": 0.2332763671875,
      "learning_rate": 4.3763682887805153e-07,
      "loss": 0.001,
      "num_tokens": 11172004.0,
      "reward": 0.00010974896576954052,
      "reward_std": 0.00021949793153908104,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001219432961079292,
      "rewards/logprob_reward/std": 0.0006898154970258474,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 792.25,
      "completions/mean_terminated_length": 701.5652465820312,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 1.2716049382716048,
      "grad_norm": 0.6068112356341268,
      "kl": 0.233154296875,
      "learning_rate": 4.3730608840186625e-07,
      "loss": 0.0106,
      "num_tokens": 11204188.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.007216878701001406,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 797.125,
      "completions/mean_terminated_length": 694.0,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 1.2746913580246915,
      "grad_norm": 0.8086582079593566,
      "kl": 0.2498779296875,
      "learning_rate": 4.3697459889583166e-07,
      "loss": 0.0192,
      "num_tokens": 11236384.0,
      "reward": 0.0001524329709354788,
      "reward_std": 0.0003048659418709576,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00016936997417360544,
      "rewards/logprob_reward/std": 0.0008339976775459945,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 819.03125,
      "completions/mean_terminated_length": 771.7307739257812,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 1.2777777777777777,
      "grad_norm": 0.7641636281958442,
      "kl": 0.2178955078125,
      "learning_rate": 4.366423616855615e-07,
      "loss": -0.0009,
      "num_tokens": 11268933.0,
      "reward": 0.0031583672389388084,
      "reward_std": 0.006228073500096798,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 3.707461655722e-05,
      "rewards/logprob_reward/std": 0.00020972569473087788,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 757.8125,
      "completions/mean_terminated_length": 719.7857666015625,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 1.2808641975308643,
      "grad_norm": 0.9999706539197956,
      "kl": 0.244384765625,
      "learning_rate": 4.363093780996596e-07,
      "loss": 0.0114,
      "num_tokens": 11299483.0,
      "reward": 0.006281028036028147,
      "reward_std": 0.007278934586793184,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 3.447556446189992e-05,
      "rewards/logprob_reward/std": 0.00019502323993947357,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 798.21875,
      "completions/mean_terminated_length": 756.4074096679688,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 1.2839506172839505,
      "grad_norm": 1.0122308107373106,
      "kl": 0.243896484375,
      "learning_rate": 4.359756494697146e-07,
      "loss": 0.01,
      "num_tokens": 11331722.0,
      "reward": 0.0033252749126404524,
      "reward_std": 0.006650549825280905,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0002225275820819661,
      "rewards/logprob_reward/std": 0.0012588060926645994,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 693.25,
      "completions/mean_terminated_length": 659.0344848632812,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 1.287037037037037,
      "grad_norm": 0.9867934630640751,
      "kl": 0.2191162109375,
      "learning_rate": 4.356411771302944e-07,
      "loss": -0.0119,
      "num_tokens": 11360202.0,
      "reward": 0.009411254897713661,
      "reward_std": 0.013443084433674812,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 4.028295734315179e-05,
      "rewards/logprob_reward/std": 0.00022787481429986656,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 820.25,
      "completions/mean_terminated_length": 782.5185546875,
      "completions/min_length": 548.0,
      "completions/min_terminated_length": 548.0,
      "epoch": 1.2901234567901234,
      "grad_norm": 0.8168515073126328,
      "kl": 0.2159423828125,
      "learning_rate": 4.353059624189411e-07,
      "loss": -0.001,
      "num_tokens": 11393438.0,
      "reward": 0.00036042043939232826,
      "reward_std": 0.0007208408787846565,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00040046716458164155,
      "rewards/logprob_reward/std": 0.0022653844207525253,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 845.0,
      "completions/mean_terminated_length": 785.3333740234375,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 1.2932098765432098,
      "grad_norm": 0.9737864152659005,
      "kl": 0.203369140625,
      "learning_rate": 4.3497000667616534e-07,
      "loss": -0.0195,
      "num_tokens": 11427646.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 828.5,
      "completions/mean_terminated_length": 763.3333740234375,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 1.2962962962962963,
      "grad_norm": 0.7791225374329636,
      "kl": 0.19439697265625,
      "learning_rate": 4.346333112454413e-07,
      "loss": 0.0309,
      "num_tokens": 11460646.0,
      "reward": 0.0001546065614093095,
      "reward_std": 0.000309213122818619,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00017178506823256612,
      "rewards/logprob_reward/std": 0.0008412457536906004,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 710.15625,
      "completions/mean_terminated_length": 689.2333984375,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 1.2993827160493827,
      "grad_norm": 0.9427759344956138,
      "kl": 0.2515869140625,
      "learning_rate": 4.342958774732011e-07,
      "loss": -0.0213,
      "num_tokens": 11489603.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 741.4375,
      "completions/mean_terminated_length": 662.3200073242188,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 1.3024691358024691,
      "grad_norm": 0.7831719097110312,
      "kl": 0.2220458984375,
      "learning_rate": 4.3395770670882935e-07,
      "loss": -0.0053,
      "num_tokens": 11520029.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 844.46875,
      "completions/mean_terminated_length": 803.0385131835938,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 1.3055555555555556,
      "grad_norm": 0.006609870276807969,
      "kl": 0.2305908203125,
      "learning_rate": 4.3361880030465803e-07,
      "loss": 0.0002,
      "num_tokens": 11553660.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 799.6875,
      "completions/mean_terminated_length": 697.727294921875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 1.308641975308642,
      "grad_norm": 1.2247530200330305,
      "kl": 0.227783203125,
      "learning_rate": 4.3327915961596066e-07,
      "loss": -0.1106,
      "num_tokens": 11585826.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 424
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 762.46875,
      "completions/mean_terminated_length": 714.0370483398438,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 1.3117283950617284,
      "grad_norm": 0.98173031770165,
      "kl": NaN,
      "learning_rate": 4.3293878600094746e-07,
      "loss": -0.0167,
      "num_tokens": 11616589.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 782.59375,
      "completions/mean_terminated_length": 726.8846435546875,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 1.3148148148148149,
      "grad_norm": 1.3587790999170124,
      "kl": 0.22607421875,
      "learning_rate": 4.325976808207594e-07,
      "loss": -0.0105,
      "num_tokens": 11648068.0,
      "reward": 0.006669376045465469,
      "reward_std": 0.01258049812167883,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0004659731639549136,
      "rewards/logprob_reward/std": 0.001984819769859314,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 750.15625,
      "completions/mean_terminated_length": 673.47998046875,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 1.3179012345679013,
      "grad_norm": 0.7013206768494553,
      "kl": 0.218994140625,
      "learning_rate": 4.3225584543946303e-07,
      "loss": -0.0138,
      "num_tokens": 11678481.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 739.1875,
      "completions/mean_terminated_length": 709.72412109375,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 1.3209876543209877,
      "grad_norm": 0.736204130156878,
      "kl": 0.2166748046875,
      "learning_rate": 4.319132812240448e-07,
      "loss": -0.011,
      "num_tokens": 11708567.0,
      "reward": 0.00011441440437920392,
      "reward_std": 0.00022882880875840783,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001271271175937727,
      "rewards/logprob_reward/std": 0.0007191395852714777,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 695.0625,
      "completions/mean_terminated_length": 602.9599609375,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 1.324074074074074,
      "grad_norm": 1.641092297738994,
      "kl": 0.22650146484375,
      "learning_rate": 4.3156998954440587e-07,
      "loss": -0.0958,
      "num_tokens": 11737389.0,
      "reward": 0.003597402013838291,
      "reward_std": 0.0068130940198898315,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0005248911329545081,
      "rewards/logprob_reward/std": 0.0022221512626856565,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 681.625,
      "completions/mean_terminated_length": 632.7142944335938,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 1.3271604938271606,
      "grad_norm": 1.575642809217235,
      "kl": 0.261474609375,
      "learning_rate": 4.312259717733565e-07,
      "loss": -0.1046,
      "num_tokens": 11765653.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 730.4375,
      "completions/mean_terminated_length": 632.5833740234375,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 1.3302469135802468,
      "grad_norm": 0.04890130949547042,
      "kl": 0.226806640625,
      "learning_rate": 4.308812292866105e-07,
      "loss": 0.0002,
      "num_tokens": 11795639.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 691.78125,
      "completions/mean_terminated_length": 669.6333618164062,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 1.3333333333333333,
      "grad_norm": 0.9286398876317612,
      "kl": 0.2548828125,
      "learning_rate": 4.3053576346277997e-07,
      "loss": 0.0227,
      "num_tokens": 11823924.0,
      "reward": 0.006325773429125547,
      "reward_std": 0.007368425372987986,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 8.419268124271184e-05,
      "rewards/logprob_reward/std": 0.00047626570449210703,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 786.875,
      "completions/mean_terminated_length": 694.0869750976562,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 1.3364197530864197,
      "grad_norm": 1.2072542458909037,
      "kl": 0.2181396484375,
      "learning_rate": 4.301895756833692e-07,
      "loss": -0.0002,
      "num_tokens": 11855800.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 718.5625,
      "completions/mean_terminated_length": 648.0769653320312,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 1.3395061728395061,
      "grad_norm": 1.3281527836782274,
      "kl": 0.22998046875,
      "learning_rate": 4.298426673327701e-07,
      "loss": -0.0395,
      "num_tokens": 11885422.0,
      "reward": 0.006339985877275467,
      "reward_std": 0.012679971754550934,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 9.998405585065484e-05,
      "rewards/logprob_reward/std": 0.0004191905609332025,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 782.25,
      "completions/mean_terminated_length": 726.4615478515625,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 1.3425925925925926,
      "grad_norm": 1.4341930099899078,
      "kl": 0.24560546875,
      "learning_rate": 4.2949503979825563e-07,
      "loss": -0.0136,
      "num_tokens": 11916806.0,
      "reward": 0.00984628964215517,
      "reward_std": 0.018601160496473312,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0005236548604443669,
      "rewards/logprob_reward/std": 0.002705881604924798,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 803.125,
      "completions/mean_terminated_length": 741.2799682617188,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 1.345679012345679,
      "grad_norm": 0.8642238947769881,
      "kl": 0.232421875,
      "learning_rate": 4.2914669446997504e-07,
      "loss": -0.0011,
      "num_tokens": 11948830.0,
      "reward": 0.006460936740040779,
      "reward_std": 0.01237230934202671,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0002343738597119227,
      "rewards/logprob_reward/std": 0.001325818826444447,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 710.875,
      "completions/mean_terminated_length": 652.888916015625,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "epoch": 1.3487654320987654,
      "grad_norm": 1.0281930220539874,
      "kl": 0.22412109375,
      "learning_rate": 4.287976327409478e-07,
      "loss": 0.0191,
      "num_tokens": 11977982.0,
      "reward": 0.006372864358127117,
      "reward_std": 0.012745728716254234,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00013651592598762363,
      "rewards/logprob_reward/std": 0.0007722506416030228,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 770.34375,
      "completions/mean_terminated_length": 711.8077392578125,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 1.3518518518518519,
      "grad_norm": 1.4499981004239422,
      "kl": 0.2191162109375,
      "learning_rate": 4.284478560070585e-07,
      "loss": -0.0696,
      "num_tokens": 12009161.0,
      "reward": 0.00633084774017334,
      "reward_std": 0.01266169548034668,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 8.983034058474004e-05,
      "rewards/logprob_reward/std": 0.0005081571289338171,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 807.0625,
      "completions/mean_terminated_length": 693.4285888671875,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 1.3549382716049383,
      "grad_norm": 0.7341836753699728,
      "kl": 0.2386474609375,
      "learning_rate": 4.280973656670508e-07,
      "loss": 0.0018,
      "num_tokens": 12041515.0,
      "reward": 0.0033574083354324102,
      "reward_std": 0.006110795307904482,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0002582314482424408,
      "rewards/logprob_reward/std": 0.0014607777120545506,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 805.5,
      "completions/mean_terminated_length": 706.1818237304688,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 1.3580246913580247,
      "grad_norm": 1.2039689263705495,
      "kl": 0.2305908203125,
      "learning_rate": 4.277461631225221e-07,
      "loss": -0.0319,
      "num_tokens": 12074411.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 709.1875,
      "completions/mean_terminated_length": 664.2142944335938,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 1.3611111111111112,
      "grad_norm": 0.9632888697626055,
      "kl": 0.258544921875,
      "learning_rate": 4.2739424977791784e-07,
      "loss": 0.0093,
      "num_tokens": 12103149.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 702.625,
      "completions/mean_terminated_length": 669.3793334960938,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 1.3641975308641976,
      "grad_norm": 1.4351619860940739,
      "kl": 0.2476806640625,
      "learning_rate": 4.2704162704052594e-07,
      "loss": -0.0805,
      "num_tokens": 12132189.0,
      "reward": 0.006343253888189793,
      "reward_std": 0.012686507776379585,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00010361517342971638,
      "rewards/logprob_reward/std": 0.0005861359531991184,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 756.71875,
      "completions/mean_terminated_length": 729.0689697265625,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 1.367283950617284,
      "grad_norm": 1.0343246403227044,
      "kl": 0.2568359375,
      "learning_rate": 4.2668829632047124e-07,
      "loss": -0.0203,
      "num_tokens": 12162592.0,
      "reward": 0.003239110577851534,
      "reward_std": 0.006478221155703068,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0001267895131604746,
      "rewards/logprob_reward/std": 0.0007172297919169068,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 714.3125,
      "completions/mean_terminated_length": 656.9629516601562,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 1.3703703703703702,
      "grad_norm": 0.03445418685545314,
      "kl": 0.25927734375,
      "learning_rate": 4.2633425903070973e-07,
      "loss": 0.0003,
      "num_tokens": 12191910.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 763.875,
      "completions/mean_terminated_length": 726.7142944335938,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 1.373456790123457,
      "grad_norm": 0.8898686519862932,
      "kl": 0.2269287109375,
      "learning_rate": 4.259795165870229e-07,
      "loss": 0.003,
      "num_tokens": 12222826.0,
      "reward": 0.0063004931434988976,
      "reward_std": 0.007317864801734686,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 5.610326479654759e-05,
      "rewards/logprob_reward/std": 0.00031736798700876534,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 731.65625,
      "completions/mean_terminated_length": 677.5184936523438,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 1.376543209876543,
      "grad_norm": 0.8783443173591432,
      "kl": 0.218505859375,
      "learning_rate": 4.256240704080121e-07,
      "loss": -0.0138,
      "num_tokens": 12252615.0,
      "reward": 0.0001754740223987028,
      "reward_std": 0.0003509480447974056,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00019497114408295602,
      "rewards/logprob_reward/std": 0.0011029232991859317,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 805.65625,
      "completions/mean_terminated_length": 706.4091186523438,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 1.3796296296296298,
      "grad_norm": 0.8940524822228907,
      "kl": 0.253173828125,
      "learning_rate": 4.2526792191509297e-07,
      "loss": 0.0231,
      "num_tokens": 12285108.0,
      "reward": 0.0034149284474551678,
      "reward_std": 0.0068298568949103355,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00032214270322583616,
      "rewards/logprob_reward/std": 0.0018223143415525556,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 836.96875,
      "completions/mean_terminated_length": 784.5999755859375,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 1.382716049382716,
      "grad_norm": 0.007217653048510719,
      "kl": 0.229736328125,
      "learning_rate": 4.249110725324897e-07,
      "loss": 0.0002,
      "num_tokens": 12318715.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 808.625,
      "completions/mean_terminated_length": 758.923095703125,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 1.3858024691358024,
      "grad_norm": 1.0950945919731718,
      "kl": 0.2406005859375,
      "learning_rate": 4.2455352368722916e-07,
      "loss": 0.0005,
      "num_tokens": 12351235.0,
      "reward": 0.009825151413679123,
      "reward_std": 0.018510140478610992,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0005001676036044955,
      "rewards/logprob_reward/std": 0.002829375211149454,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 786.25,
      "completions/mean_terminated_length": 731.3846435546875,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.3888888888888888,
      "grad_norm": 3.214026215003321,
      "kl": 0.47955322265625,
      "learning_rate": 4.2419527680913554e-07,
      "loss": -0.1991,
      "num_tokens": 12383055.0,
      "reward": 0.006404605228453875,
      "reward_std": 0.01280921045690775,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00017178348207380623,
      "rewards/logprob_reward/std": 0.0009717540815472603,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 771.6875,
      "completions/mean_terminated_length": 713.4615478515625,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 1.3919753086419753,
      "grad_norm": 0.7936696463586366,
      "kl": 0.2222900390625,
      "learning_rate": 4.2383633333082423e-07,
      "loss": 0.0021,
      "num_tokens": 12414541.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 715.75,
      "completions/mean_terminated_length": 671.7142944335938,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 1.3950617283950617,
      "grad_norm": 3.459592763290137,
      "kl": 0.55084228515625,
      "learning_rate": 4.234766946876965e-07,
      "loss": 0.0037,
      "num_tokens": 12444073.0,
      "reward": 0.0062960912473499775,
      "reward_std": 0.012469880282878876,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 5.121239155414514e-05,
      "rewards/logprob_reward/std": 0.00028970104176551104,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 706.0625,
      "completions/mean_terminated_length": 632.6923217773438,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 1.3981481481481481,
      "grad_norm": 1.3456803869912601,
      "kl": 0.2529296875,
      "learning_rate": 4.231163623179335e-07,
      "loss": -0.0036,
      "num_tokens": 12472799.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 727.3125,
      "completions/mean_terminated_length": 672.370361328125,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 1.4012345679012346,
      "grad_norm": 1.6723204574364663,
      "kl": 0.218505859375,
      "learning_rate": 4.227553376624904e-07,
      "loss": 0.0056,
      "num_tokens": 12502501.0,
      "reward": 0.003401299240067601,
      "reward_std": 0.006802598480135202,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00030699922353960574,
      "rewards/logprob_reward/std": 0.0010323910973966122,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 656.65625,
      "completions/mean_terminated_length": 604.1785888671875,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 1.404320987654321,
      "grad_norm": 1.6559128796505724,
      "kl": 0.240478515625,
      "learning_rate": 4.22393622165091e-07,
      "loss": 0.0077,
      "num_tokens": 12529674.0,
      "reward": 0.009458878077566624,
      "reward_std": 0.018917756155133247,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 9.31981485337019e-05,
      "rewards/logprob_reward/std": 0.00037146464455872774,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 761.34375,
      "completions/mean_terminated_length": 712.7037353515625,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 1.4074074074074074,
      "grad_norm": 0.5264237404210453,
      "kl": 0.2056884765625,
      "learning_rate": 4.220312172722216e-07,
      "loss": -0.0058,
      "num_tokens": 12560489.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.007216878701001406,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 867.0625,
      "completions/mean_terminated_length": 759.6842041015625,
      "completions/min_length": 590.0,
      "completions/min_terminated_length": 590.0,
      "epoch": 1.4104938271604939,
      "grad_norm": 1.075088647657135,
      "kl": 0.224609375,
      "learning_rate": 4.216681244331256e-07,
      "loss": -0.1083,
      "num_tokens": 12595327.0,
      "reward": 0.006331074051558971,
      "reward_std": 0.007311693392693996,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 9.008179040392861e-05,
      "rewards/logprob_reward/std": 0.0005095795495435596,
      "step": 457
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 687.375,
      "completions/mean_terminated_length": 609.6923217773438,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 1.4135802469135803,
      "grad_norm": 0.41564115682942565,
      "kl": NaN,
      "learning_rate": 4.2130434509979714e-07,
      "loss": 0.0154,
      "num_tokens": 12623371.0,
      "reward": 8.736336894799024e-06,
      "reward_std": 1.7472673789598048e-05,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.707041499495972e-06,
      "rewards/logprob_reward/std": 5.491132105817087e-05,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 718.8125,
      "completions/mean_terminated_length": 687.2413940429688,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 1.4166666666666667,
      "grad_norm": 1.2057762501743143,
      "kl": 0.2008056640625,
      "learning_rate": 4.209398807269758e-07,
      "loss": 0.0058,
      "num_tokens": 12653257.0,
      "reward": 0.0063441055826842785,
      "reward_std": 0.012688211165368557,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00010456141899339855,
      "rewards/logprob_reward/std": 0.0005914886714890599,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 761.40625,
      "completions/mean_terminated_length": 712.7777709960938,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 1.4197530864197532,
      "grad_norm": 0.869286542617104,
      "kl": 0.2095947265625,
      "learning_rate": 4.205747327721407e-07,
      "loss": -0.0244,
      "num_tokens": 12684330.0,
      "reward": 0.00018575230205897242,
      "reward_std": 0.00037150460411794484,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00020639145805034786,
      "rewards/logprob_reward/std": 0.0011675263522192836,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 743.0625,
      "completions/mean_terminated_length": 702.9285888671875,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 1.4228395061728394,
      "grad_norm": 1.0774651339855361,
      "kl": 0.2396240234375,
      "learning_rate": 4.2020890269550454e-07,
      "loss": 0.0084,
      "num_tokens": 12714532.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 713.40625,
      "completions/mean_terminated_length": 692.7000122070312,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 1.425925925925926,
      "grad_norm": 1.3919502874150627,
      "kl": 0.248291015625,
      "learning_rate": 4.198423919600076e-07,
      "loss": -0.043,
      "num_tokens": 12743541.0,
      "reward": 0.0095895417034626,
      "reward_std": 0.018620356917381287,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00023837909975554794,
      "rewards/logprob_reward/std": 0.0013484758092090487,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 717.1875,
      "completions/mean_terminated_length": 673.357177734375,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 1.4290123456790123,
      "grad_norm": 0.9788629478908946,
      "kl": 0.2410888671875,
      "learning_rate": 4.1947520203131217e-07,
      "loss": -0.0199,
      "num_tokens": 12773231.0,
      "reward": 0.0031524146907031536,
      "reward_std": 0.006304829381406307,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 3.0460822017630562e-05,
      "rewards/logprob_reward/std": 0.00017231242964044213,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 783.5,
      "completions/mean_terminated_length": 689.3912963867188,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.4320987654320987,
      "grad_norm": 0.008039119546979199,
      "kl": 0.2203369140625,
      "learning_rate": 4.191073343777968e-07,
      "loss": 0.0002,
      "num_tokens": 12805019.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 692.375,
      "completions/mean_terminated_length": 645.0,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 1.4351851851851851,
      "grad_norm": 1.0947998540882207,
      "kl": 0.2392578125,
      "learning_rate": 4.1873879047055005e-07,
      "loss": -0.0068,
      "num_tokens": 12833295.0,
      "reward": 0.003245285712182522,
      "reward_std": 0.006490571424365044,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00013365063932724297,
      "rewards/logprob_reward/std": 0.0007560421363450587,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 705.3125,
      "completions/mean_terminated_length": 684.0667114257812,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 1.4382716049382716,
      "grad_norm": 2.753381604776745,
      "kl": 0.222900390625,
      "learning_rate": 4.183695717833649e-07,
      "loss": -0.2018,
      "num_tokens": 12862265.0,
      "reward": 0.006487657316029072,
      "reward_std": 0.012975314632058144,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00026406353572383523,
      "rewards/logprob_reward/std": 0.0014937689993530512,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 649.28125,
      "completions/mean_terminated_length": 624.300048828125,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 1.441358024691358,
      "grad_norm": 1.1900591830657552,
      "kl": 0.22802734375,
      "learning_rate": 4.179996797927326e-07,
      "loss": 0.0072,
      "num_tokens": 12889074.0,
      "reward": 0.0064575402066111565,
      "reward_std": 0.012915080413222313,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00023059980594553053,
      "rewards/logprob_reward/std": 0.0013044695369899273,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 743.9375,
      "completions/mean_terminated_length": 703.9285888671875,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 1.4444444444444444,
      "grad_norm": 1.6501266285285199,
      "kl": 0.2314453125,
      "learning_rate": 4.17629115977837e-07,
      "loss": -0.1072,
      "num_tokens": 12919324.0,
      "reward": 0.00937500037252903,
      "reward_std": 0.01875000074505806,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 816.15625,
      "completions/mean_terminated_length": 721.6818237304688,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 1.4475308641975309,
      "grad_norm": 0.008942735752603462,
      "kl": 0.22998046875,
      "learning_rate": 4.1725788182054867e-07,
      "loss": 0.0002,
      "num_tokens": 12952597.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 728.3125,
      "completions/mean_terminated_length": 660.0769653320312,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 1.4506172839506173,
      "grad_norm": 1.7481401793763334,
      "kl": 0.2347412109375,
      "learning_rate": 4.1688597880541863e-07,
      "loss": 0.0162,
      "num_tokens": 12982287.0,
      "reward": 0.009573226794600487,
      "reward_std": 0.019079623743891716,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0002202522591687739,
      "rewards/logprob_reward/std": 0.000702223158441484,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 706.1875,
      "completions/mean_terminated_length": 685.0000610351562,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 1.4537037037037037,
      "grad_norm": 1.326458316628465,
      "kl": 0.220947265625,
      "learning_rate": 4.1651340841967284e-07,
      "loss": -0.0554,
      "num_tokens": 13010945.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 755.0625,
      "completions/mean_terminated_length": 679.760009765625,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 1.4567901234567902,
      "grad_norm": 0.6618278332040402,
      "kl": 0.210693359375,
      "learning_rate": 4.161401721532059e-07,
      "loss": 0.0077,
      "num_tokens": 13042015.0,
      "reward": 4.2685020162025467e-05,
      "reward_std": 8.537004032405093e-05,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 4.7427802201127633e-05,
      "rewards/logprob_reward/std": 0.00026829217677004635,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 626.3125,
      "completions/mean_terminated_length": 613.4838256835938,
      "completions/min_length": 269.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 1.4598765432098766,
      "grad_norm": 1.1416619745506837,
      "kl": 0.266845703125,
      "learning_rate": 4.1576627149857513e-07,
      "loss": -0.0454,
      "num_tokens": 13068481.0,
      "reward": 0.0032101499382406473,
      "reward_std": 0.006420299876481295,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 9.461108857067302e-05,
      "rewards/logprob_reward/std": 0.0005352011066861451,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 729.4375,
      "completions/mean_terminated_length": 674.888916015625,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 1.462962962962963,
      "grad_norm": 0.9798319736211897,
      "kl": 0.2449951171875,
      "learning_rate": 4.153917079509952e-07,
      "loss": -0.0141,
      "num_tokens": 13098375.0,
      "reward": 9.32246693992056e-05,
      "reward_std": 0.0001864493387984112,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.000103582970041316,
      "rewards/logprob_reward/std": 0.0005859537632204592,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 745.53125,
      "completions/mean_terminated_length": 705.7500610351562,
      "completions/min_length": 301.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 1.4660493827160495,
      "grad_norm": 0.6640956393226755,
      "kl": 0.219482421875,
      "learning_rate": 4.150164830083311e-07,
      "loss": 0.0052,
      "num_tokens": 13128980.0,
      "reward": 0.0032039100769907236,
      "reward_std": 0.006199179217219353,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 8.767777762841433e-05,
      "rewards/logprob_reward/std": 0.0004959804355166852,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 699.75,
      "completions/mean_terminated_length": 678.1333618164062,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 1.4691358024691357,
      "grad_norm": 0.00856726162889787,
      "kl": 0.2303466796875,
      "learning_rate": 4.146405981710931e-07,
      "loss": 0.0002,
      "num_tokens": 13157432.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 868.0,
      "completions/mean_length": 740.03125,
      "completions/mean_terminated_length": 660.5199584960938,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 1.4722222222222223,
      "grad_norm": 0.7604375052940118,
      "kl": 0.215576171875,
      "learning_rate": 4.142640549424302e-07,
      "loss": -0.0034,
      "num_tokens": 13187909.0,
      "reward": 2.138299350917805e-05,
      "reward_std": 4.27659870183561e-05,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 2.3758882889524102e-05,
      "rewards/logprob_reward/std": 0.0001344005431747064,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 786.4375,
      "completions/mean_terminated_length": 693.478271484375,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 1.4753086419753085,
      "grad_norm": 0.7540158733026404,
      "kl": 0.19873046875,
      "learning_rate": 4.1388685482812413e-07,
      "loss": -0.0112,
      "num_tokens": 13219503.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 732.46875,
      "completions/mean_terminated_length": 650.8399658203125,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 1.4783950617283952,
      "grad_norm": 1.2765891413572787,
      "kl": 0.21826171875,
      "learning_rate": 4.135089993365839e-07,
      "loss": 0.0176,
      "num_tokens": 13249258.0,
      "reward": 0.0002850447781383991,
      "reward_std": 0.0005700895562767982,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00031671643955633044,
      "rewards/logprob_reward/std": 0.0011264249915257096,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 734.40625,
      "completions/mean_terminated_length": 715.1000366210938,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 1.4814814814814814,
      "grad_norm": 1.4233734996648617,
      "kl": 0.19580078125,
      "learning_rate": 4.131304899788389e-07,
      "loss": -0.0326,
      "num_tokens": 13279483.0,
      "reward": 0.0003334644134156406,
      "reward_std": 0.0006669288268312812,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00037051603430882096,
      "rewards/logprob_reward/std": 0.0012172131100669503,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 681.125,
      "completions/mean_terminated_length": 602.0,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "epoch": 1.4845679012345678,
      "grad_norm": 0.007854875065843677,
      "kl": 0.235107421875,
      "learning_rate": 4.127513282685336e-07,
      "loss": 0.0002,
      "num_tokens": 13307583.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 763.65625,
      "completions/mean_terminated_length": 703.5769653320312,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 1.4876543209876543,
      "grad_norm": 1.3271718238623136,
      "kl": 0.206787109375,
      "learning_rate": 4.123715157219211e-07,
      "loss": -0.0084,
      "num_tokens": 13338964.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 667.3125,
      "completions/mean_terminated_length": 643.5333862304688,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 1.4907407407407407,
      "grad_norm": 0.5993122360613606,
      "kl": 0.217041015625,
      "learning_rate": 4.1199105385785727e-07,
      "loss": -0.0016,
      "num_tokens": 13366546.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 768.59375,
      "completions/mean_terminated_length": 732.107177734375,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 1.4938271604938271,
      "grad_norm": 0.006839411443825314,
      "kl": 0.2059326171875,
      "learning_rate": 4.116099441977943e-07,
      "loss": 0.0002,
      "num_tokens": 13397565.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 755.28125,
      "completions/mean_terminated_length": 716.8928833007812,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 1.4969135802469136,
      "grad_norm": 1.1233779045963534,
      "kl": 0.2186279296875,
      "learning_rate": 4.112281882657751e-07,
      "loss": 0.0319,
      "num_tokens": 13428318.0,
      "reward": 0.006400207057595253,
      "reward_std": 0.0075172921642661095,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0001668964105192572,
      "rewards/logprob_reward/std": 0.0006705340929329395,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 734.03125,
      "completions/mean_terminated_length": 692.607177734375,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 1.5,
      "grad_norm": 0.008426954253993934,
      "kl": 0.205322265625,
      "learning_rate": 4.1084578758842714e-07,
      "loss": 0.0002,
      "num_tokens": 13457907.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 854.0,
      "completions/mean_length": 658.59375,
      "completions/mean_terminated_length": 590.9259033203125,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 1.5030864197530864,
      "grad_norm": 1.6402412231893142,
      "kl": 0.25927734375,
      "learning_rate": 4.104627436949559e-07,
      "loss": 0.0074,
      "num_tokens": 13485362.0,
      "reward": 0.012500000186264515,
      "reward_std": 0.02500000037252903,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 712.125,
      "completions/mean_terminated_length": 679.862060546875,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 1.5061728395061729,
      "grad_norm": 0.6426115974868853,
      "kl": 0.2218017578125,
      "learning_rate": 4.1007905811713915e-07,
      "loss": 0.0205,
      "num_tokens": 13514342.0,
      "reward": 8.19168271846138e-05,
      "reward_std": 0.0001638336543692276,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.101870091399178e-05,
      "rewards/logprob_reward/std": 0.0005148795316927135,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 643.0625,
      "completions/mean_terminated_length": 603.6551513671875,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 1.5092592592592593,
      "grad_norm": 0.8659957852460237,
      "kl": 0.22216796875,
      "learning_rate": 4.096947323893209e-07,
      "loss": -0.0122,
      "num_tokens": 13540668.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 701.40625,
      "completions/mean_terminated_length": 641.6666870117188,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 1.5123456790123457,
      "grad_norm": 1.2243377483078597,
      "kl": 0.2393798828125,
      "learning_rate": 4.0930976804840487e-07,
      "loss": 0.0068,
      "num_tokens": 13569205.0,
      "reward": 0.0032299254089593887,
      "reward_std": 0.0064598508179187775,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0001165838330052793,
      "rewards/logprob_reward/std": 0.0004594208439812064,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 744.53125,
      "completions/mean_terminated_length": 692.7777709960938,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 1.515432098765432,
      "grad_norm": 0.9748368726692964,
      "kl": 0.239501953125,
      "learning_rate": 4.0892416663384874e-07,
      "loss": 0.0009,
      "num_tokens": 13599022.0,
      "reward": 0.0032928246073424816,
      "reward_std": 0.006585649214684963,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00018647167598828673,
      "rewards/logprob_reward/std": 0.0010548430727794766,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 653.09375,
      "completions/mean_terminated_length": 628.36669921875,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 1.5185185185185186,
      "grad_norm": 0.910886561519072,
      "kl": 0.23583984375,
      "learning_rate": 4.0853792968765765e-07,
      "loss": 0.0509,
      "num_tokens": 13626137.0,
      "reward": 0.00042957477853633463,
      "reward_std": 0.0008591495570726693,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.000477305322419852,
      "rewards/logprob_reward/std": 0.0019594368059188128,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 780.8125,
      "completions/mean_terminated_length": 712.719970703125,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 1.5216049382716048,
      "grad_norm": 1.274269025857144,
      "kl": 0.234130859375,
      "learning_rate": 4.081510587543784e-07,
      "loss": 0.0175,
      "num_tokens": 13657803.0,
      "reward": 0.007064519450068474,
      "reward_std": 0.012881873175501823,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0009050215012393892,
      "rewards/logprob_reward/std": 0.0036485553719103336,
      "step": 493
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 737.375,
      "completions/mean_terminated_length": 671.2307739257812,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 1.5246913580246915,
      "grad_norm": 0.7721523436134393,
      "kl": NaN,
      "learning_rate": 4.0776355538109285e-07,
      "loss": -0.0027,
      "num_tokens": 13687715.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 679.46875,
      "completions/mean_terminated_length": 615.6666870117188,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 1.5277777777777777,
      "grad_norm": 1.1190525113595386,
      "kl": 0.24267578125,
      "learning_rate": 4.073754211174123e-07,
      "loss": -0.0302,
      "num_tokens": 13716146.0,
      "reward": 0.0032679312862455845,
      "reward_std": 0.006535862572491169,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0001588123559486121,
      "rewards/logprob_reward/std": 0.0008983783191069961,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 718.5625,
      "completions/mean_terminated_length": 648.0769653320312,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 1.5308641975308643,
      "grad_norm": 0.011772250359833447,
      "kl": 0.235595703125,
      "learning_rate": 4.069866575154706e-07,
      "loss": 0.0002,
      "num_tokens": 13746456.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 748.75,
      "completions/mean_terminated_length": 685.2307739257812,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 1.5339506172839505,
      "grad_norm": 1.036641945037653,
      "kl": 0.2569580078125,
      "learning_rate": 4.0659726612991853e-07,
      "loss": 0.0029,
      "num_tokens": 13777220.0,
      "reward": 0.00014904368435963988,
      "reward_std": 0.00029808736871927977,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001656040985835716,
      "rewards/logprob_reward/std": 0.0009367982274852693,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 775.65625,
      "completions/mean_terminated_length": 692.875,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 1.5370370370370372,
      "grad_norm": 2.040791241615359,
      "kl": 0.21435546875,
      "learning_rate": 4.062072485179172e-07,
      "loss": -0.0446,
      "num_tokens": 13808961.0,
      "reward": 0.0074306377209723,
      "reward_std": 0.013677163049578667,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00131182000041008,
      "rewards/logprob_reward/std": 0.004722681827843189,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 690.0,
      "completions/mean_terminated_length": 628.1481323242188,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 1.5401234567901234,
      "grad_norm": 0.8678643078533776,
      "kl": 0.20458984375,
      "learning_rate": 4.0581660623913216e-07,
      "loss": 0.004,
      "num_tokens": 13837493.0,
      "reward": 0.006410412490367889,
      "reward_std": 0.007036527618765831,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0001782359031494707,
      "rewards/logprob_reward/std": 0.0010082544758915901,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 714.0625,
      "completions/mean_terminated_length": 693.4000244140625,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 1.5432098765432098,
      "grad_norm": 0.007614484510444482,
      "kl": 0.2413330078125,
      "learning_rate": 4.0542534085572677e-07,
      "loss": 0.0002,
      "num_tokens": 13866455.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 751.84375,
      "completions/mean_terminated_length": 675.6400146484375,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 1.5462962962962963,
      "grad_norm": 1.5316793683826315,
      "kl": 0.2237548828125,
      "learning_rate": 4.050334539323563e-07,
      "loss": -0.0311,
      "num_tokens": 13897134.0,
      "reward": 0.003464690176770091,
      "reward_std": 0.006929380353540182,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00037743343273177743,
      "rewards/logprob_reward/std": 0.0015973311383277178,
      "step": 501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 726.75,
      "completions/mean_terminated_length": 684.2857666015625,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 1.5493827160493827,
      "grad_norm": 0.8030631221407978,
      "kl": 0.212890625,
      "learning_rate": 4.046409470361615e-07,
      "loss": -0.0132,
      "num_tokens": 13926998.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 748.375,
      "completions/mean_terminated_length": 697.3333129882812,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 1.5524691358024691,
      "grad_norm": 1.1411802118497676,
      "kl": 0.1939697265625,
      "learning_rate": 4.0424782173676235e-07,
      "loss": -0.0043,
      "num_tokens": 13957978.0,
      "reward": 0.003966109361499548,
      "reward_std": 0.007221629843115807,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0009345660218968987,
      "rewards/logprob_reward/std": 0.0036782834213227034,
      "step": 503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 754.375,
      "completions/mean_terminated_length": 692.1538696289062,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 1.5555555555555556,
      "grad_norm": 1.1392876503462597,
      "kl": 0.2095947265625,
      "learning_rate": 4.0385407960625185e-07,
      "loss": 0.0484,
      "num_tokens": 13988954.0,
      "reward": 0.0007476196624338627,
      "reward_std": 0.0014952393248677254,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0008306884556077421,
      "rewards/logprob_reward/std": 0.003281813580542803,
      "step": 504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 757.21875,
      "completions/mean_terminated_length": 719.107177734375,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 1.558641975308642,
      "grad_norm": 0.007288560957668381,
      "kl": 0.2373046875,
      "learning_rate": 4.034597222191896e-07,
      "loss": 0.0002,
      "num_tokens": 14020117.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 667.28125,
      "completions/mean_terminated_length": 616.3214721679688,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 1.5617283950617284,
      "grad_norm": 0.5935219450571073,
      "kl": 0.2216796875,
      "learning_rate": 4.030647511525956e-07,
      "loss": 0.0217,
      "num_tokens": 14047862.0,
      "reward": 0.00018285616533830762,
      "reward_std": 0.00036571233067661524,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00020317352027632296,
      "rewards/logprob_reward/std": 0.001149323070421815,
      "step": 506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 682.125,
      "completions/mean_terminated_length": 659.3333740234375,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 1.5648148148148149,
      "grad_norm": 1.258804726631916,
      "kl": 0.2191162109375,
      "learning_rate": 4.0266916798594417e-07,
      "loss": -0.0273,
      "num_tokens": 14076670.0,
      "reward": 0.00024014676455408335,
      "reward_std": 0.0004802935291081667,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00026682973839342594,
      "rewards/logprob_reward/std": 0.0012838775292038918,
      "step": 507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 726.0,
      "completions/mean_terminated_length": 695.1724243164062,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 1.567901234567901,
      "grad_norm": 0.6454728419319262,
      "kl": 0.21533203125,
      "learning_rate": 4.02272974301157e-07,
      "loss": 0.0095,
      "num_tokens": 14105914.0,
      "reward": 0.0001895047607831657,
      "reward_std": 0.0003790095215663314,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00021056084369774908,
      "rewards/logprob_reward/std": 0.0011911119800060987,
      "step": 508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 791.03125,
      "completions/mean_terminated_length": 737.269287109375,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 1.5709876543209877,
      "grad_norm": 1.2859936176358455,
      "kl": 0.197265625,
      "learning_rate": 4.018761716825974e-07,
      "loss": -0.0258,
      "num_tokens": 14138467.0,
      "reward": 0.00028381761512719095,
      "reward_std": 0.0005676352302543819,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00031535292509943247,
      "rewards/logprob_reward/std": 0.001241048565134406,
      "step": 509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 646.5,
      "completions/mean_terminated_length": 607.4483032226562,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 1.574074074074074,
      "grad_norm": 0.008592360131948421,
      "kl": 0.2469482421875,
      "learning_rate": 4.014787617170639e-07,
      "loss": 0.0002,
      "num_tokens": 14165595.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 675.03125,
      "completions/mean_terminated_length": 638.9310302734375,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 1.5771604938271606,
      "grad_norm": 1.171419955274731,
      "kl": 0.1822509765625,
      "learning_rate": 4.010807459937836e-07,
      "loss": -0.0501,
      "num_tokens": 14194016.0,
      "reward": 0.003713682759553194,
      "reward_std": 0.006750314496457577,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0006540918839164078,
      "rewards/logprob_reward/std": 0.002199083101004362,
      "step": 511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 697.21875,
      "completions/mean_terminated_length": 621.8077392578125,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 1.5802469135802468,
      "grad_norm": 0.9975735423759305,
      "kl": 0.2100830078125,
      "learning_rate": 4.006821261044061e-07,
      "loss": 0.0032,
      "num_tokens": 14222627.0,
      "reward": 0.0062500000931322575,
      "reward_std": 0.012500000186264515,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 739.6875,
      "completions/mean_terminated_length": 720.7333984375,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 1.5833333333333335,
      "grad_norm": 1.0210658224486546,
      "kl": 0.2042236328125,
      "learning_rate": 4.002829036429971e-07,
      "loss": -0.0316,
      "num_tokens": 14253177.0,
      "reward": 0.0034115025773644447,
      "reward_std": 0.0068230051547288895,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0003183361841365695,
      "rewards/logprob_reward/std": 0.001800781348720193,
      "step": 513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 618.25,
      "completions/mean_terminated_length": 605.1612548828125,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 1.5864197530864197,
      "grad_norm": 1.5387658671321223,
      "kl": 0.2314453125,
      "learning_rate": 3.998830802060317e-07,
      "loss": -0.0561,
      "num_tokens": 14279533.0,
      "reward": 9.884743485599756e-05,
      "reward_std": 0.00019769486971199512,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00010983048559864983,
      "rewards/logprob_reward/std": 0.0004931351286359131,
      "step": 514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 653.0625,
      "completions/mean_terminated_length": 614.6896362304688,
      "completions/min_length": 240.0,
      "completions/min_terminated_length": 240.0,
      "epoch": 1.5895061728395061,
      "grad_norm": 0.6673077990520957,
      "kl": 0.228271484375,
      "learning_rate": 3.994826573923886e-07,
      "loss": 0.0146,
      "num_tokens": 14306735.0,
      "reward": 2.7812768166768365e-05,
      "reward_std": 5.562553633353673e-05,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.090307654929347e-05,
      "rewards/logprob_reward/std": 0.00017481419490650296,
      "step": 515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 638.40625,
      "completions/mean_terminated_length": 612.7000122070312,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 1.5925925925925926,
      "grad_norm": 0.7409619007185388,
      "kl": 0.21533203125,
      "learning_rate": 3.9908163680334326e-07,
      "loss": -0.0085,
      "num_tokens": 14333544.0,
      "reward": 0.006347795017063618,
      "reward_std": 0.007331542205065489,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00010866135562537238,
      "rewards/logprob_reward/std": 0.0006146814557723701,
      "step": 516
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1018.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 703.125,
      "completions/mean_terminated_length": 703.125,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 1.595679012345679,
      "grad_norm": 1.803806100339916,
      "kl": NaN,
      "learning_rate": 3.9868002004256165e-07,
      "loss": -0.0982,
      "num_tokens": 14362676.0,
      "reward": 0.0035705072805285454,
      "reward_std": 0.007141014561057091,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0004950080765411258,
      "rewards/logprob_reward/std": 0.0022015306167304516,
      "step": 517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 700.46875,
      "completions/mean_terminated_length": 667.0,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 1.5987654320987654,
      "grad_norm": 1.6329114031815135,
      "kl": 0.19091796875,
      "learning_rate": 3.982778087160935e-07,
      "loss": -0.0519,
      "num_tokens": 14391563.0,
      "reward": 0.0034556398168206215,
      "reward_std": 0.006911279633641243,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0003673774190247059,
      "rewards/logprob_reward/std": 0.0014456151984632015,
      "step": 518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 665.78125,
      "completions/mean_terminated_length": 628.72412109375,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 1.6018518518518519,
      "grad_norm": 0.628706653315792,
      "kl": 0.1845703125,
      "learning_rate": 3.9787500443236664e-07,
      "loss": 0.004,
      "num_tokens": 14419624.0,
      "reward": 1.9111619621980935e-05,
      "reward_std": 3.8223235605983064e-05,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 2.1235133317532018e-05,
      "rewards/logprob_reward/std": 0.00012012405932182446,
      "step": 519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 671.21875,
      "completions/mean_terminated_length": 647.7000122070312,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 1.6049382716049383,
      "grad_norm": 1.3226224371308348,
      "kl": 0.2242431640625,
      "learning_rate": 3.9747160880217994e-07,
      "loss": -0.0485,
      "num_tokens": 14447795.0,
      "reward": 0.003268610220402479,
      "reward_std": 0.006537220440804958,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00015956697461660951,
      "rewards/logprob_reward/std": 0.0009026471525430679,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 711.84375,
      "completions/mean_terminated_length": 667.25,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.6080246913580247,
      "grad_norm": 1.403521045440835,
      "kl": 0.1754150390625,
      "learning_rate": 3.9706762343869705e-07,
      "loss": -0.0531,
      "num_tokens": 14477182.0,
      "reward": 0.0002255470462841913,
      "reward_std": 0.0004165461577940732,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0002506078453734517,
      "rewards/logprob_reward/std": 0.0010859397007152438,
      "step": 521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 693.28125,
      "completions/mean_terminated_length": 659.0689697265625,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 1.6111111111111112,
      "grad_norm": 1.297088064026116,
      "kl": 0.198486328125,
      "learning_rate": 3.966630499574397e-07,
      "loss": -0.0447,
      "num_tokens": 14505659.0,
      "reward": 0.003463702742010355,
      "reward_std": 0.006641572806984186,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0003763362765312195,
      "rewards/logprob_reward/std": 0.0014818207127973437,
      "step": 522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 674.4375,
      "completions/mean_terminated_length": 663.1612548828125,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 1.6141975308641974,
      "grad_norm": 1.2878515019824124,
      "kl": 0.197021484375,
      "learning_rate": 3.9625788997628196e-07,
      "loss": 0.0182,
      "num_tokens": 14533549.0,
      "reward": 0.0033389755990356207,
      "reward_std": 0.006677951198071241,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0002377507626079023,
      "rewards/logprob_reward/std": 0.00101320946123451,
      "step": 523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 915.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 619.375,
      "completions/mean_terminated_length": 619.375,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 1.617283950617284,
      "grad_norm": 1.5402558614689486,
      "kl": 0.2086181640625,
      "learning_rate": 3.958521451154428e-07,
      "loss": -0.0177,
      "num_tokens": 14559761.0,
      "reward": 0.006601343862712383,
      "reward_std": 0.012623758986592293,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00039038198883645236,
      "rewards/logprob_reward/std": 0.0012395030353218317,
      "step": 524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 719.625,
      "completions/mean_terminated_length": 676.1428833007812,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 1.6203703703703702,
      "grad_norm": 0.8735394195416519,
      "kl": 0.1729736328125,
      "learning_rate": 3.954458169974805e-07,
      "loss": -0.0163,
      "num_tokens": 14589565.0,
      "reward": 0.0002931773487944156,
      "reward_std": 0.0005863546975888312,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0003257526259403676,
      "rewards/logprob_reward/std": 0.0018427351024001837,
      "step": 525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 649.78125,
      "completions/mean_terminated_length": 624.8333740234375,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 1.623456790123457,
      "grad_norm": 2.12400021297932,
      "kl": 0.2386474609375,
      "learning_rate": 3.950389072472855e-07,
      "loss": -0.1252,
      "num_tokens": 14616710.0,
      "reward": 0.009645880199968815,
      "reward_std": 0.014008638449013233,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00030097743729129434,
      "rewards/logprob_reward/std": 0.001185261644423008,
      "step": 526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 669.53125,
      "completions/mean_terminated_length": 632.862060546875,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 1.626543209876543,
      "grad_norm": 1.2793150513425062,
      "kl": 0.183837890625,
      "learning_rate": 3.9463141749207425e-07,
      "loss": -0.0152,
      "num_tokens": 14644815.0,
      "reward": 0.0002448577433824539,
      "reward_std": 0.0004897154867649078,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0002720641787163913,
      "rewards/logprob_reward/std": 0.001189305679872632,
      "step": 527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 629.28125,
      "completions/mean_terminated_length": 616.54833984375,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 1.6296296296296298,
      "grad_norm": 2.7478992262055466,
      "kl": 0.254150390625,
      "learning_rate": 3.9422334936138255e-07,
      "loss": -0.1333,
      "num_tokens": 14671220.0,
      "reward": 0.0062837013974785805,
      "reward_std": 0.012540229596197605,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 3.744592686416581e-05,
      "rewards/logprob_reward/std": 0.0001500230428064242,
      "step": 528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 686.3125,
      "completions/mean_terminated_length": 651.3793334960938,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 1.632716049382716,
      "grad_norm": 0.7965683182840033,
      "kl": 0.251708984375,
      "learning_rate": 3.938147044870594e-07,
      "loss": 0.0158,
      "num_tokens": 14699818.0,
      "reward": 9.006389882415533e-05,
      "reward_std": 0.00018012779764831066,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00010007100354414433,
      "rewards/logprob_reward/std": 0.0005295322043821216,
      "step": 529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 765.0625,
      "completions/mean_terminated_length": 692.5599975585938,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 1.6358024691358026,
      "grad_norm": 2.232671746705429,
      "kl": 0.2332763671875,
      "learning_rate": 3.934054845032598e-07,
      "loss": -0.0223,
      "num_tokens": 14731156.0,
      "reward": 0.0006742465193383396,
      "reward_std": 0.0013484930386766791,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0007491627475246787,
      "rewards/logprob_reward/std": 0.002688066568225622,
      "step": 530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 676.75,
      "completions/mean_terminated_length": 665.54833984375,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 1.6388888888888888,
      "grad_norm": 1.2362784051265552,
      "kl": 0.1810302734375,
      "learning_rate": 3.9299569104643876e-07,
      "loss": -0.0151,
      "num_tokens": 14759360.0,
      "reward": 0.003136072074994445,
      "reward_std": 0.00627214414998889,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 1.2302157301746774e-05,
      "rewards/logprob_reward/std": 6.959151505725458e-05,
      "step": 531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 714.46875,
      "completions/mean_terminated_length": 657.1481323242188,
      "completions/min_length": 317.0,
      "completions/min_terminated_length": 317.0,
      "epoch": 1.6419753086419753,
      "grad_norm": 0.01923557030109291,
      "kl": 0.2412109375,
      "learning_rate": 3.925853257553445e-07,
      "loss": 0.0002,
      "num_tokens": 14788351.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 663.0,
      "completions/mean_terminated_length": 638.933349609375,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 1.6450617283950617,
      "grad_norm": 0.7118540255991272,
      "kl": 0.24749755859375,
      "learning_rate": 3.921743902710122e-07,
      "loss": 0.0148,
      "num_tokens": 14816111.0,
      "reward": 7.841860497137532e-05,
      "reward_std": 0.00015683720994275063,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 8.71317824930884e-05,
      "rewards/logprob_reward/std": 0.0004928918206132948,
      "step": 533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 676.6875,
      "completions/mean_terminated_length": 653.5333862304688,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 1.6481481481481481,
      "grad_norm": 1.9074909191744749,
      "kl": 0.23583984375,
      "learning_rate": 3.917628862367569e-07,
      "loss": -0.0625,
      "num_tokens": 14844345.0,
      "reward": 0.0003346746671013534,
      "reward_std": 0.0006693493342027068,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0003718607476912439,
      "rewards/logprob_reward/std": 0.0017217351123690605,
      "step": 534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 746.03125,
      "completions/mean_terminated_length": 668.2000122070312,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 1.6512345679012346,
      "grad_norm": 1.345902352064227,
      "kl": 0.364990234375,
      "learning_rate": 3.913508152981674e-07,
      "loss": 0.0156,
      "num_tokens": 14875014.0,
      "reward": 0.00032999191898852587,
      "reward_std": 0.0006599838379770517,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00036665768129751086,
      "rewards/logprob_reward/std": 0.0013613435439765453,
      "step": 535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 632.40625,
      "completions/mean_terminated_length": 591.8965454101562,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 1.654320987654321,
      "grad_norm": 1.6643347435632014,
      "kl": 0.248046875,
      "learning_rate": 3.909381791030998e-07,
      "loss": -0.0584,
      "num_tokens": 14901283.0,
      "reward": 0.0005056440131738782,
      "reward_std": 0.0010112880263477564,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0005618267459794879,
      "rewards/logprob_reward/std": 0.002438273513689637,
      "step": 536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 737.8125,
      "completions/mean_terminated_length": 684.8148193359375,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 1.6574074074074074,
      "grad_norm": 1.1208025492877656,
      "kl": 0.2255859375,
      "learning_rate": 3.905249793016702e-07,
      "loss": -0.0176,
      "num_tokens": 14931865.0,
      "reward": 0.0033837691880762577,
      "reward_std": 0.006767538376152515,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0002875212230719626,
      "rewards/logprob_reward/std": 0.001626465586014092,
      "step": 537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 689.3125,
      "completions/mean_terminated_length": 641.5,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 1.6604938271604939,
      "grad_norm": 0.012149488981183494,
      "kl": 0.2154541015625,
      "learning_rate": 3.9011121754624865e-07,
      "loss": 0.0002,
      "num_tokens": 14960259.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 717.8125,
      "completions/mean_terminated_length": 697.4000244140625,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 1.6635802469135803,
      "grad_norm": 1.6654338514291054,
      "kl": 0.201171875,
      "learning_rate": 3.8969689549145266e-07,
      "loss": -0.0166,
      "num_tokens": 14989973.0,
      "reward": 0.0038445612881332636,
      "reward_std": 0.00677096052095294,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0007995126652531326,
      "rewards/logprob_reward/std": 0.0022769954521209,
      "step": 539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 727.25,
      "completions/mean_terminated_length": 644.1599731445312,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 1.6666666666666665,
      "grad_norm": 0.025271756498403648,
      "kl": 0.2305908203125,
      "learning_rate": 3.8928201479414024e-07,
      "loss": 0.0002,
      "num_tokens": 15019449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 682.75,
      "completions/mean_terminated_length": 647.4483032226562,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 1.6697530864197532,
      "grad_norm": 0.0343700147662808,
      "kl": 0.259033203125,
      "learning_rate": 3.888665771134032e-07,
      "loss": 0.0003,
      "num_tokens": 15047285.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 632.8125,
      "completions/mean_terminated_length": 620.1935424804688,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 1.6728395061728394,
      "grad_norm": 1.0155951920968442,
      "kl": 0.2408447265625,
      "learning_rate": 3.8845058411056095e-07,
      "loss": -0.0351,
      "num_tokens": 15073459.0,
      "reward": 0.003251735121011734,
      "reward_std": 0.006403619423508644,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.000140816715429537,
      "rewards/logprob_reward/std": 0.0005689717945642769,
      "step": 542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 716.4375,
      "completions/mean_terminated_length": 659.4815063476562,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 1.675925925925926,
      "grad_norm": 1.0392347068243504,
      "kl": 0.205322265625,
      "learning_rate": 3.880340374491535e-07,
      "loss": -0.0343,
      "num_tokens": 15103001.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 636.03125,
      "completions/mean_terminated_length": 610.1666870117188,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 1.6790123456790123,
      "grad_norm": 1.3288486748243753,
      "kl": 0.240234375,
      "learning_rate": 3.8761693879493495e-07,
      "loss": -0.0364,
      "num_tokens": 15129690.0,
      "reward": 0.003559510223567486,
      "reward_std": 0.006802430842071772,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00048278900794684887,
      "rewards/logprob_reward/std": 0.0016537087503820658,
      "step": 544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 671.0625,
      "completions/mean_terminated_length": 647.5333862304688,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 1.682098765432099,
      "grad_norm": 1.3301409124638641,
      "kl": 0.201416015625,
      "learning_rate": 3.871992898158667e-07,
      "loss": -0.0243,
      "num_tokens": 15157504.0,
      "reward": 0.009390555322170258,
      "reward_std": 0.013497989624738693,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 1.7283886336372234e-05,
      "rewards/logprob_reward/std": 9.777242667041719e-05,
      "step": 545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 779.375,
      "completions/mean_terminated_length": 722.923095703125,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 1.6851851851851851,
      "grad_norm": 1.8399131374109736,
      "kl": 0.236083984375,
      "learning_rate": 3.867810921821112e-07,
      "loss": -0.0683,
      "num_tokens": 15189700.0,
      "reward": 0.0002412402827758342,
      "reward_std": 0.0004824805655516684,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00026804476510733366,
      "rewards/logprob_reward/std": 0.0015162901254370809,
      "step": 546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 663.625,
      "completions/mean_terminated_length": 639.6000366210938,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 1.6882716049382716,
      "grad_norm": 1.6899053313155517,
      "kl": 0.2037353515625,
      "learning_rate": 3.863623475660245e-07,
      "loss": -0.1045,
      "num_tokens": 15217380.0,
      "reward": 0.006546499207615852,
      "reward_std": 0.013092998415231705,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0003294435446150601,
      "rewards/logprob_reward/std": 0.0016374721890315413,
      "step": 547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 728.0,
      "completions/mean_terminated_length": 697.3793334960938,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 1.691358024691358,
      "grad_norm": 1.1925377947779334,
      "kl": 0.230224609375,
      "learning_rate": 3.859430576421503e-07,
      "loss": 0.0307,
      "num_tokens": 15247136.0,
      "reward": 0.0033502508886158466,
      "reward_std": 0.006618572399020195,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0002502789138816297,
      "rewards/logprob_reward/std": 0.0008935660007409751,
      "step": 548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 686.46875,
      "completions/mean_terminated_length": 623.9629516601562,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 1.6944444444444444,
      "grad_norm": 1.8250365220303777,
      "kl": 0.200927734375,
      "learning_rate": 3.855232240872128e-07,
      "loss": -0.0448,
      "num_tokens": 15275503.0,
      "reward": 0.0033973990939557552,
      "reward_std": 0.0067947981879115105,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00030266543035395443,
      "rewards/logprob_reward/std": 0.001362743554636836,
      "step": 549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 659.1875,
      "completions/mean_terminated_length": 621.4483032226562,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 1.6975308641975309,
      "grad_norm": 1.2164846800422262,
      "kl": 0.24462890625,
      "learning_rate": 3.851028485801105e-07,
      "loss": -0.0359,
      "num_tokens": 15303321.0,
      "reward": 0.0005772442673332989,
      "reward_std": 0.0007730101933702826,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0006413825321942568,
      "rewards/logprob_reward/std": 0.002183767966926098,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 659.90625,
      "completions/mean_terminated_length": 622.2413940429688,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 1.7006172839506173,
      "grad_norm": 1.64498439566592,
      "kl": 0.2210693359375,
      "learning_rate": 3.8468193280190864e-07,
      "loss": -0.0603,
      "num_tokens": 15330974.0,
      "reward": 0.00019263531430624425,
      "reward_std": 0.0003852706286124885,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00021403926075436175,
      "rewards/logprob_reward/std": 0.0008833868196234107,
      "step": 551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 651.25,
      "completions/mean_terminated_length": 639.2257690429688,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 1.7037037037037037,
      "grad_norm": 0.8204484969903891,
      "kl": 0.2283935546875,
      "learning_rate": 3.842604784358333e-07,
      "loss": -0.0067,
      "num_tokens": 15357954.0,
      "reward": 0.00011239574087085202,
      "reward_std": 0.00022479148174170405,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00012488415814004838,
      "rewards/logprob_reward/std": 0.0007064514793455601,
      "step": 552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 730.4375,
      "completions/mean_terminated_length": 710.86669921875,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 1.7067901234567902,
      "grad_norm": 0.6324076683656258,
      "kl": 0.1995849609375,
      "learning_rate": 3.8383848716726444e-07,
      "loss": 0.0143,
      "num_tokens": 15388404.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 856.0,
      "completions/mean_length": 598.65625,
      "completions/mean_terminated_length": 570.300048828125,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 1.7098765432098766,
      "grad_norm": 1.3854096979699027,
      "kl": 0.243408203125,
      "learning_rate": 3.8341596068372874e-07,
      "loss": -0.0223,
      "num_tokens": 15413525.0,
      "reward": 0.0031891183461993933,
      "reward_std": 0.0063782366923987865,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 7.124279363779351e-05,
      "rewards/logprob_reward/std": 0.0003295870847068727,
      "step": 554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 641.375,
      "completions/mean_terminated_length": 586.7142944335938,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 1.7129629629629628,
      "grad_norm": 1.5483551432739786,
      "kl": 0.274658203125,
      "learning_rate": 3.829929006748934e-07,
      "loss": -0.0428,
      "num_tokens": 15440121.0,
      "reward": 0.0006713579641655087,
      "reward_std": 0.0013427160447463393,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0007459533517248929,
      "rewards/logprob_reward/std": 0.0038864153902977705,
      "step": 555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 741.15625,
      "completions/mean_terminated_length": 711.8965454101562,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 1.7160493827160495,
      "grad_norm": 1.8155255721411654,
      "kl": 0.1968994140625,
      "learning_rate": 3.8256930883255927e-07,
      "loss": -0.0472,
      "num_tokens": 15470438.0,
      "reward": 0.0034768604673445225,
      "reward_std": 0.006953720934689045,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00039095626561902463,
      "rewards/logprob_reward/std": 0.001544651691801846,
      "step": 556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 685.96875,
      "completions/mean_terminated_length": 651.0,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 1.7191358024691357,
      "grad_norm": 0.6701429724195487,
      "kl": 0.204833984375,
      "learning_rate": 3.8214518685065377e-07,
      "loss": 0.001,
      "num_tokens": 15499145.0,
      "reward": 0.00042355037294328213,
      "reward_std": 0.0004890738055109978,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00047061152872629464,
      "rewards/logprob_reward/std": 0.0018518351716920733,
      "step": 557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 646.34375,
      "completions/mean_terminated_length": 621.1666870117188,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 1.7222222222222223,
      "grad_norm": 1.698009864594641,
      "kl": 0.225830078125,
      "learning_rate": 3.817205364252244e-07,
      "loss": 0.0202,
      "num_tokens": 15525912.0,
      "reward": 0.001057287328876555,
      "reward_std": 0.0018129103118553758,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0011747637763619423,
      "rewards/logprob_reward/std": 0.003242099191993475,
      "step": 558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 712.84375,
      "completions/mean_terminated_length": 692.1000366210938,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 1.7253086419753085,
      "grad_norm": 0.8978241670939364,
      "kl": 0.1929931640625,
      "learning_rate": 3.8129535925443187e-07,
      "loss": -0.0231,
      "num_tokens": 15555443.0,
      "reward": 0.0001963062968570739,
      "reward_std": 0.00023446467821486294,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00021811810438521206,
      "rewards/logprob_reward/std": 0.000874139426741749,
      "step": 559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 683.28125,
      "completions/mean_terminated_length": 660.5667114257812,
      "completions/min_length": 333.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 1.7283950617283952,
      "grad_norm": 0.9053528909515298,
      "kl": 0.215576171875,
      "learning_rate": 3.8086965703854336e-07,
      "loss": -0.0109,
      "num_tokens": 15583916.0,
      "reward": 0.00016514095477759838,
      "reward_std": 0.00033028190955519676,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001834899594541639,
      "rewards/logprob_reward/std": 0.0010379758896306157,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 860.0,
      "completions/mean_length": 648.59375,
      "completions/mean_terminated_length": 609.7586059570312,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 1.7314814814814814,
      "grad_norm": 1.3833182735515157,
      "kl": 0.2352294921875,
      "learning_rate": 3.8044343147992563e-07,
      "loss": 0.0218,
      "num_tokens": 15611067.0,
      "reward": 0.0003771684132516384,
      "reward_std": 0.0005551050999201834,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00041907603736035526,
      "rewards/logprob_reward/std": 0.0012183680664747953,
      "step": 561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 676.40625,
      "completions/mean_terminated_length": 653.2333374023438,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 1.734567901234568,
      "grad_norm": 2.663795700852014,
      "kl": 0.221923828125,
      "learning_rate": 3.8001668428303847e-07,
      "loss": -0.0667,
      "num_tokens": 15639632.0,
      "reward": 0.0038018166087567806,
      "reward_std": 0.007603633217513561,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0007520184153690934,
      "rewards/logprob_reward/std": 0.0032845232635736465,
      "step": 562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 673.59375,
      "completions/mean_terminated_length": 637.3448486328125,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 1.7376543209876543,
      "grad_norm": 1.1606607471316734,
      "kl": 0.21630859375,
      "learning_rate": 3.7958941715442726e-07,
      "loss": -0.0115,
      "num_tokens": 15667271.0,
      "reward": 0.0031921612098813057,
      "reward_std": 0.006344469729810953,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 7.462350185960531e-05,
      "rewards/logprob_reward/std": 0.0003383457660675049,
      "step": 563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 681.6875,
      "completions/mean_terminated_length": 632.7857666015625,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 1.7407407407407407,
      "grad_norm": 1.2183370306869523,
      "kl": 0.2139892578125,
      "learning_rate": 3.791616318027171e-07,
      "loss": -0.0217,
      "num_tokens": 15695405.0,
      "reward": 0.003422896610572934,
      "reward_std": 0.006645845249295235,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.000330996117554605,
      "rewards/logprob_reward/std": 0.0014105957234278321,
      "step": 564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 787.0,
      "completions/mean_length": 669.34375,
      "completions/mean_terminated_length": 603.6666870117188,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 1.7438271604938271,
      "grad_norm": 0.6382976844465137,
      "kl": 0.2100830078125,
      "learning_rate": 3.78733329938605e-07,
      "loss": 0.0197,
      "num_tokens": 15723200.0,
      "reward": 6.218590715434402e-05,
      "reward_std": 0.00012437181430868804,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 6.909545481903479e-05,
      "rewards/logprob_reward/std": 0.00039086290053091943,
      "step": 565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 710.375,
      "completions/mean_terminated_length": 677.9310302734375,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 1.7469135802469136,
      "grad_norm": 1.8427554754446938,
      "kl": 0.21417236328125,
      "learning_rate": 3.7830451327485367e-07,
      "loss": -0.1368,
      "num_tokens": 15752992.0,
      "reward": 0.00027952424716204405,
      "reward_std": 0.0005590484943240881,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0003105824871454388,
      "rewards/logprob_reward/std": 0.001312681590206921,
      "step": 566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 990.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 648.25,
      "completions/mean_terminated_length": 648.25,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 1.75,
      "grad_norm": 0.03047510652804787,
      "kl": 0.240234375,
      "learning_rate": 3.778751835262847e-07,
      "loss": 0.0002,
      "num_tokens": 15779968.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 638.6875,
      "completions/mean_terminated_length": 613.0000610351562,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 1.7530864197530864,
      "grad_norm": 0.9325899409839425,
      "kl": 0.206787109375,
      "learning_rate": 3.7744534240977085e-07,
      "loss": -0.0031,
      "num_tokens": 15806774.0,
      "reward": 0.00016520038479939103,
      "reward_std": 0.00033040076959878206,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00018355599604547024,
      "rewards/logprob_reward/std": 0.0010383494663983583,
      "step": 568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 637.25,
      "completions/mean_terminated_length": 597.2413940429688,
      "completions/min_length": 340.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 1.7561728395061729,
      "grad_norm": 2.1549187244707677,
      "kl": 0.234619140625,
      "learning_rate": 3.7701499164423045e-07,
      "loss": -0.0691,
      "num_tokens": 15833482.0,
      "reward": 0.0003800169506575912,
      "reward_std": 0.0007600339013151824,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0004222410498186946,
      "rewards/logprob_reward/std": 0.0018106489442288876,
      "step": 569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 742.8125,
      "completions/mean_terminated_length": 702.6428833007812,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 1.7592592592592593,
      "grad_norm": 1.121914628219875,
      "kl": 0.22802734375,
      "learning_rate": 3.7658413295061974e-07,
      "loss": 0.0065,
      "num_tokens": 15864092.0,
      "reward": 0.00044991367030888796,
      "reward_std": 0.0008998273406177759,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0004999040393158793,
      "rewards/logprob_reward/std": 0.0017304662615060806,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 734.53125,
      "completions/mean_terminated_length": 653.47998046875,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 1.7623456790123457,
      "grad_norm": 1.339499848247988,
      "kl": 0.229248046875,
      "learning_rate": 3.7615276805192595e-07,
      "loss": -0.0622,
      "num_tokens": 15894121.0,
      "reward": 0.006375543307512999,
      "reward_std": 0.012420847080647945,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0001394923747284338,
      "rewards/logprob_reward/std": 0.0007890879642218351,
      "step": 571
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 691.8125,
      "completions/mean_terminated_length": 630.2963256835938,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 1.765432098765432,
      "grad_norm": 1.3147532536862638,
      "kl": NaN,
      "learning_rate": 3.7572089867316075e-07,
      "loss": -0.0712,
      "num_tokens": 15923103.0,
      "reward": 0.003216506913304329,
      "reward_std": 0.006433013826608658,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00010167431173613295,
      "rewards/logprob_reward/std": 0.0005751567659899592,
      "step": 572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 624.78125,
      "completions/mean_terminated_length": 567.75,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 1.7685185185185186,
      "grad_norm": 2.9182706994037972,
      "kl": 0.2005615234375,
      "learning_rate": 3.7528852654135323e-07,
      "loss": -0.3139,
      "num_tokens": 15949304.0,
      "reward": 0.0016523003578186035,
      "reward_std": 0.00254919589497149,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0018358894158154726,
      "rewards/logprob_reward/std": 0.004228347912430763,
      "step": 573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 692.96875,
      "completions/mean_terminated_length": 658.72412109375,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 1.7716049382716048,
      "grad_norm": 3.1566661593032346,
      "kl": 0.2393798828125,
      "learning_rate": 3.7485565338554294e-07,
      "loss": -0.1953,
      "num_tokens": 15977843.0,
      "reward": 0.000254699494689703,
      "reward_std": 0.000509398989379406,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0002829994191415608,
      "rewards/logprob_reward/std": 0.000908093003090471,
      "step": 574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 713.90625,
      "completions/mean_terminated_length": 703.9031982421875,
      "completions/min_length": 498.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 1.7746913580246915,
      "grad_norm": 1.4487019782934505,
      "kl": 0.21484375,
      "learning_rate": 3.7442228093677296e-07,
      "loss": -0.0497,
      "num_tokens": 16006856.0,
      "reward": 0.0034210658632218838,
      "reward_std": 0.006678018253296614,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00032896202174015343,
      "rewards/logprob_reward/std": 0.001040315837599337,
      "step": 575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 683.0625,
      "completions/mean_terminated_length": 634.357177734375,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 1.7777777777777777,
      "grad_norm": 1.7400070642658096,
      "kl": 0.245361328125,
      "learning_rate": 3.7398841092808307e-07,
      "loss": -0.1239,
      "num_tokens": 16035294.0,
      "reward": 0.001132089295424521,
      "reward_std": 0.002264178590849042,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0012578770983964205,
      "rewards/logprob_reward/std": 0.005214401055127382,
      "step": 576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 620.71875,
      "completions/mean_terminated_length": 579.0,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 1.7808641975308643,
      "grad_norm": 1.882047078975824,
      "kl": 0.2557373046875,
      "learning_rate": 3.735540450945028e-07,
      "loss": 0.0118,
      "num_tokens": 16061417.0,
      "reward": 0.003746317932382226,
      "reward_std": 0.007492635864764452,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.000690353219397366,
      "rewards/logprob_reward/std": 0.002425077138468623,
      "step": 577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 675.40625,
      "completions/mean_terminated_length": 625.607177734375,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 1.7839506172839505,
      "grad_norm": 1.5233515915669114,
      "kl": 0.21844482421875,
      "learning_rate": 3.731191851730443e-07,
      "loss": -0.1325,
      "num_tokens": 16089662.0,
      "reward": 0.007472705096006393,
      "reward_std": 0.012153670191764832,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0013585611013695598,
      "rewards/logprob_reward/std": 0.007685181684792042,
      "step": 578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 666.53125,
      "completions/mean_terminated_length": 655.0,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 1.7870370370370372,
      "grad_norm": 1.9879078602215337,
      "kl": 0.202392578125,
      "learning_rate": 3.7268383290269583e-07,
      "loss": -0.0217,
      "num_tokens": 16117055.0,
      "reward": 0.0004778489819727838,
      "reward_std": 0.0009556979639455676,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0005309433327056468,
      "rewards/logprob_reward/std": 0.001705501927062869,
      "step": 579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 752.625,
      "completions/mean_terminated_length": 676.6400146484375,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 1.7901234567901234,
      "grad_norm": 1.6217444321309262,
      "kl": 0.2469482421875,
      "learning_rate": 3.7224799002441427e-07,
      "loss": -0.0398,
      "num_tokens": 16148175.0,
      "reward": 0.00041415169835090637,
      "reward_std": 0.0008283034549094737,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00046016855048947036,
      "rewards/logprob_reward/std": 0.0026031064335256815,
      "step": 580
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 834.0,
      "completions/mean_length": 687.84375,
      "completions/mean_terminated_length": 639.8214721679688,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 1.7932098765432098,
      "grad_norm": 1.3112878299707869,
      "kl": NaN,
      "learning_rate": 3.718116582811186e-07,
      "loss": -0.0825,
      "num_tokens": 16176942.0,
      "reward": 0.0003379134286660701,
      "reward_std": 0.0006758268573321402,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0003754593781195581,
      "rewards/logprob_reward/std": 0.0014890246093273163,
      "step": 581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 697.9375,
      "completions/mean_terminated_length": 651.357177734375,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 1.7962962962962963,
      "grad_norm": 0.749183739087493,
      "kl": 0.2171630859375,
      "learning_rate": 3.713748394176827e-07,
      "loss": -0.013,
      "num_tokens": 16205860.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 639.4375,
      "completions/mean_terminated_length": 627.0322265625,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 1.7993827160493827,
      "grad_norm": 2.1244408546901385,
      "kl": 0.21630859375,
      "learning_rate": 3.7093753518092853e-07,
      "loss": -0.112,
      "num_tokens": 16232626.0,
      "reward": 0.0002797901979647577,
      "reward_std": 0.0005595803959295154,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0003108780365437269,
      "rewards/logprob_reward/std": 0.0009934562258422375,
      "step": 583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 652.375,
      "completions/mean_terminated_length": 599.2857666015625,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 1.8024691358024691,
      "grad_norm": 2.0480921821475335,
      "kl": 0.20709228515625,
      "learning_rate": 3.704997473196187e-07,
      "loss": -0.0799,
      "num_tokens": 16259786.0,
      "reward": 0.0033423788845539093,
      "reward_std": 0.006684757769107819,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00024153194681275636,
      "rewards/logprob_reward/std": 0.0008057018858380616,
      "step": 584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 659.5625,
      "completions/mean_terminated_length": 607.5,
      "completions/min_length": 301.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 1.8055555555555556,
      "grad_norm": 2.7833744522159534,
      "kl": 0.2177734375,
      "learning_rate": 3.7006147758445017e-07,
      "loss": -0.27,
      "num_tokens": 16287652.0,
      "reward": 0.0006784686120226979,
      "reward_std": 0.0012488170759752393,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0007538540521636605,
      "rewards/logprob_reward/std": 0.002179505070671439,
      "step": 585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 636.6875,
      "completions/mean_terminated_length": 624.1935424804688,
      "completions/min_length": 301.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 1.808641975308642,
      "grad_norm": 1.0955419928063221,
      "kl": 0.207275390625,
      "learning_rate": 3.696227277280467e-07,
      "loss": -0.0446,
      "num_tokens": 16314314.0,
      "reward": 0.0032903477549552917,
      "reward_std": 0.0065806955099105835,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00018371966143604368,
      "rewards/logprob_reward/std": 0.001039275317452848,
      "step": 586
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 755.03125,
      "completions/mean_terminated_length": 716.607177734375,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 1.8117283950617284,
      "grad_norm": 1.1770169484689004,
      "kl": NaN,
      "learning_rate": 3.691834995049522e-07,
      "loss": -0.0274,
      "num_tokens": 16345195.0,
      "reward": 0.00325536890886724,
      "reward_std": 0.00651073781773448,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0001448543625883758,
      "rewards/logprob_reward/std": 0.0008194200927391648,
      "step": 587
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 737.71875,
      "completions/mean_terminated_length": 684.7037353515625,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 1.8148148148148149,
      "grad_norm": 0.5399996364713595,
      "kl": NaN,
      "learning_rate": 3.687437946716234e-07,
      "loss": 0.0159,
      "num_tokens": 16375446.0,
      "reward": 5.639005757984705e-05,
      "reward_std": 0.0001127801151596941,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 6.265562115004286e-05,
      "rewards/logprob_reward/std": 0.00035443369415588677,
      "step": 588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 811.53125,
      "completions/mean_terminated_length": 728.3912963867188,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 1.817901234567901,
      "grad_norm": 0.5676428595234785,
      "kl": 0.2337646484375,
      "learning_rate": 3.68303614986423e-07,
      "loss": 0.0251,
      "num_tokens": 16408123.0,
      "reward": 0.00019499726477079093,
      "reward_std": 0.00038999452954158187,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0002166636404581368,
      "rewards/logprob_reward/std": 0.0012256345944479108,
      "step": 589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 719.65625,
      "completions/mean_terminated_length": 688.1724243164062,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 1.8209876543209877,
      "grad_norm": 1.594060644099542,
      "kl": 0.215576171875,
      "learning_rate": 3.6786296220961277e-07,
      "loss": -0.0736,
      "num_tokens": 16437728.0,
      "reward": 0.003276342060416937,
      "reward_std": 0.006552684120833874,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00016815779963508248,
      "rewards/logprob_reward/std": 0.000704619218595326,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 737.15625,
      "completions/mean_terminated_length": 696.1785888671875,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 1.824074074074074,
      "grad_norm": 0.6973092693761138,
      "kl": 0.2528076171875,
      "learning_rate": 3.6742183810334605e-07,
      "loss": 0.0096,
      "num_tokens": 16468301.0,
      "reward": 0.0031250000465661287,
      "reward_std": 0.0062500000931322575,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 701.125,
      "completions/mean_terminated_length": 667.72412109375,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 1.8271604938271606,
      "grad_norm": 0.6128440332453214,
      "kl": 0.214599609375,
      "learning_rate": 3.6698024443166134e-07,
      "loss": 0.0046,
      "num_tokens": 16497181.0,
      "reward": 0.00035004859091714025,
      "reward_std": 0.00040432787500321865,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0003889428626280278,
      "rewards/logprob_reward/std": 0.0015307284193113446,
      "step": 592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 987.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 663.84375,
      "completions/mean_terminated_length": 663.84375,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 1.8302469135802468,
      "grad_norm": 1.7003587122824284,
      "kl": 0.2296142578125,
      "learning_rate": 3.6653818296047466e-07,
      "loss": -0.0474,
      "num_tokens": 16525244.0,
      "reward": 0.003290090709924698,
      "reward_std": 0.006580181419849396,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00018343421106692404,
      "rewards/logprob_reward/std": 0.0008672911208122969,
      "step": 593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 632.09375,
      "completions/mean_terminated_length": 605.9666748046875,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 1.8333333333333335,
      "grad_norm": 0.8095433239735778,
      "kl": 0.2178955078125,
      "learning_rate": 3.660956554575729e-07,
      "loss": -0.0387,
      "num_tokens": 16552051.0,
      "reward": 0.00011567265028133988,
      "reward_std": 0.00013375480193644762,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001285251637455076,
      "rewards/logprob_reward/std": 0.0005061195697635412,
      "step": 594
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 698.96875,
      "completions/mean_terminated_length": 638.7777709960938,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 1.8364197530864197,
      "grad_norm": 1.189685264859625,
      "kl": NaN,
      "learning_rate": 3.656526636926065e-07,
      "loss": -0.0132,
      "num_tokens": 16581354.0,
      "reward": 0.0004485278914216906,
      "reward_std": 0.0008970557828433812,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0004983643302693963,
      "rewards/logprob_reward/std": 0.0022451134864240885,
      "step": 595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 643.625,
      "completions/mean_terminated_length": 631.3547973632812,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 1.8395061728395061,
      "grad_norm": 2.291384447871791,
      "kl": 0.2108154296875,
      "learning_rate": 3.652092094370826e-07,
      "loss": -0.188,
      "num_tokens": 16608210.0,
      "reward": 0.007215001620352268,
      "reward_std": 0.013606452383100986,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0010722236474975944,
      "rewards/logprob_reward/std": 0.0025399161968380213,
      "step": 596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 620.03125,
      "completions/mean_terminated_length": 607.0,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 1.8425925925925926,
      "grad_norm": 1.7033004523047413,
      "kl": 0.247802734375,
      "learning_rate": 3.647652944643577e-07,
      "loss": -0.1362,
      "num_tokens": 16634491.0,
      "reward": 0.008209185674786568,
      "reward_std": 0.016418371349573135,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00564909540116787,
      "rewards/logprob_reward/std": 0.031682878732681274,
      "step": 597
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 663.8125,
      "completions/mean_terminated_length": 639.800048828125,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 1.845679012345679,
      "grad_norm": 1.7798577344049977,
      "kl": NaN,
      "learning_rate": 3.6432092054963055e-07,
      "loss": -0.0714,
      "num_tokens": 16662137.0,
      "reward": 0.00012272670574020594,
      "reward_std": 0.0002454534114804119,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00013636301446240395,
      "rewards/logprob_reward/std": 0.0005504813161678612,
      "step": 598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 669.3125,
      "completions/mean_terminated_length": 645.6666870117188,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 1.8487654320987654,
      "grad_norm": 3.8329879003761573,
      "kl": 0.2119140625,
      "learning_rate": 3.638760894699355e-07,
      "loss": -0.2267,
      "num_tokens": 16690151.0,
      "reward": 0.003814605064690113,
      "reward_std": 0.007213447708636522,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0007662278367206454,
      "rewards/logprob_reward/std": 0.0023001504596322775,
      "step": 599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 666.6875,
      "completions/mean_terminated_length": 615.6428833007812,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.8518518518518519,
      "grad_norm": 1.9690954257948825,
      "kl": 0.189453125,
      "learning_rate": 3.6343080300413497e-07,
      "loss": -0.1472,
      "num_tokens": 16718129.0,
      "reward": 0.0006136884912848473,
      "reward_std": 0.0012273769825696945,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0006818760884925723,
      "rewards/logprob_reward/std": 0.0021795539651066065,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 955.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 616.03125,
      "completions/mean_terminated_length": 616.03125,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 1.8549382716049383,
      "grad_norm": 2.267634044761325,
      "kl": 0.20660400390625,
      "learning_rate": 3.629850629329124e-07,
      "loss": -0.1886,
      "num_tokens": 16743918.0,
      "reward": 0.0002721587661653757,
      "reward_std": 0.0005443175323307514,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00030239863554015756,
      "rewards/logprob_reward/std": 0.0009632536093704402,
      "step": 601
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 637.1875,
      "completions/mean_terminated_length": 611.4000244140625,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 1.8580246913580247,
      "grad_norm": 1.9216710596841131,
      "kl": NaN,
      "learning_rate": 3.625388710387651e-07,
      "loss": -0.117,
      "num_tokens": 16770708.0,
      "reward": 0.00043982313945889473,
      "reward_std": 0.0008796462789177895,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0004886924289166927,
      "rewards/logprob_reward/std": 0.0018993121339008212,
      "step": 602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 636.625,
      "completions/mean_terminated_length": 610.800048828125,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 1.8611111111111112,
      "grad_norm": 0.7693513544724933,
      "kl": 0.2528076171875,
      "learning_rate": 3.6209222910599746e-07,
      "loss": 0.0026,
      "num_tokens": 16797624.0,
      "reward": 0.00011575274402275681,
      "reward_std": 0.00023150548804551363,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001286141632590443,
      "rewards/logprob_reward/std": 0.0007275515235960484,
      "step": 603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 706.5,
      "completions/mean_terminated_length": 685.3333740234375,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 1.8641975308641974,
      "grad_norm": 2.756857318491456,
      "kl": 0.1982421875,
      "learning_rate": 3.616451389207133e-07,
      "loss": -0.3005,
      "num_tokens": 16826608.0,
      "reward": 0.009740946814417839,
      "reward_std": 0.019144851714372635,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00040660687955096364,
      "rewards/logprob_reward/std": 0.0009685659315437078,
      "step": 604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 669.65625,
      "completions/mean_terminated_length": 646.0333862304688,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 1.867283950617284,
      "grad_norm": 2.4007858570678065,
      "kl": 0.19091796875,
      "learning_rate": 3.611976022708091e-07,
      "loss": -0.1266,
      "num_tokens": 16854421.0,
      "reward": 0.0005778549239039421,
      "reward_std": 0.0011557098478078842,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0006420610588975251,
      "rewards/logprob_reward/std": 0.0019600477535277605,
      "step": 605
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 714.4375,
      "completions/mean_terminated_length": 682.413818359375,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 1.8703703703703702,
      "grad_norm": 1.0836992968485804,
      "kl": NaN,
      "learning_rate": 3.6074962094596676e-07,
      "loss": -0.0144,
      "num_tokens": 16884259.0,
      "reward": 0.0002586792397778481,
      "reward_std": 0.0005173584795556962,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.000287421396933496,
      "rewards/logprob_reward/std": 0.0013689196202903986,
      "step": 606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 638.9375,
      "completions/mean_terminated_length": 626.51611328125,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 1.873456790123457,
      "grad_norm": 1.9117851121082363,
      "kl": 0.271728515625,
      "learning_rate": 3.603011967376464e-07,
      "loss": -0.0971,
      "num_tokens": 16910953.0,
      "reward": 0.0063897306099534035,
      "reward_std": 0.012779461219906807,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00015525644994340837,
      "rewards/logprob_reward/std": 0.0006112787523306906,
      "step": 607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 717.1875,
      "completions/mean_terminated_length": 660.370361328125,
      "completions/min_length": 340.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 1.876543209876543,
      "grad_norm": 2.934403376079828,
      "kl": 0.221923828125,
      "learning_rate": 3.598523314390792e-07,
      "loss": -0.2261,
      "num_tokens": 16940703.0,
      "reward": 0.003488546935841441,
      "reward_std": 0.006977093871682882,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00040394088136963546,
      "rewards/logprob_reward/std": 0.001497879740782082,
      "step": 608
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 656.46875,
      "completions/mean_terminated_length": 618.4483032226562,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 1.8796296296296298,
      "grad_norm": 1.5286841883570508,
      "kl": NaN,
      "learning_rate": 3.594030268452601e-07,
      "loss": -0.1172,
      "num_tokens": 16968390.0,
      "reward": 0.007147197145968676,
      "reward_std": 0.012378408573567867,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0009968855883926153,
      "rewards/logprob_reward/std": 0.002718651667237282,
      "step": 609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 733.59375,
      "completions/mean_terminated_length": 703.5516967773438,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 1.882716049382716,
      "grad_norm": 0.8153433208659888,
      "kl": 0.245361328125,
      "learning_rate": 3.5895328475294106e-07,
      "loss": 0.0039,
      "num_tokens": 16998853.0,
      "reward": 7.518004713347182e-05,
      "reward_std": 0.00015036009426694363,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 8.353338489541784e-05,
      "rewards/logprob_reward/std": 0.00047253616503439844,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 881.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 595.40625,
      "completions/mean_terminated_length": 595.40625,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 1.8858024691358026,
      "grad_norm": 1.3981606907089654,
      "kl": 0.2476806640625,
      "learning_rate": 3.585031069606234e-07,
      "loss": -0.0461,
      "num_tokens": 17024022.0,
      "reward": 0.00025799532886594534,
      "reward_std": 0.0005159906577318907,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0002866614959202707,
      "rewards/logprob_reward/std": 0.0010666087036952376,
      "step": 611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 733.21875,
      "completions/mean_terminated_length": 703.137939453125,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 1.8888888888888888,
      "grad_norm": 1.9300403485331896,
      "kl": 0.24560546875,
      "learning_rate": 3.5805249526855074e-07,
      "loss": -0.1364,
      "num_tokens": 17054241.0,
      "reward": 0.007239177823066711,
      "reward_std": 0.013713184744119644,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0010990869486704469,
      "rewards/logprob_reward/std": 0.003983082249760628,
      "step": 612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 725.78125,
      "completions/mean_terminated_length": 694.9310302734375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.8919753086419753,
      "grad_norm": 1.614546467864275,
      "kl": 0.2275390625,
      "learning_rate": 3.5760145147870204e-07,
      "loss": -0.0186,
      "num_tokens": 17084394.0,
      "reward": 0.004577491898089647,
      "reward_std": 0.005401932634413242,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.005086102522909641,
      "rewards/logprob_reward/std": 0.01939297839999199,
      "step": 613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 649.34375,
      "completions/mean_terminated_length": 637.258056640625,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 1.8950617283950617,
      "grad_norm": 1.9682660385141868,
      "kl": 0.229736328125,
      "learning_rate": 3.571499773947839e-07,
      "loss": -0.1667,
      "num_tokens": 17111605.0,
      "reward": 0.003681524656713009,
      "reward_std": 0.007349275518208742,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0006183606456033885,
      "rewards/logprob_reward/std": 0.0019619932863861322,
      "step": 614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 663.3125,
      "completions/mean_terminated_length": 651.6774291992188,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 1.8981481481481481,
      "grad_norm": 0.01012425679665426,
      "kl": 0.263916015625,
      "learning_rate": 3.5669807482222395e-07,
      "loss": 0.0003,
      "num_tokens": 17139279.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 615
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 947.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 640.65625,
      "completions/mean_terminated_length": 640.65625,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 1.9012345679012346,
      "grad_norm": 0.006418792746033701,
      "kl": NaN,
      "learning_rate": 3.562457455681633e-07,
      "loss": 0.0002,
      "num_tokens": 17165992.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 718.3125,
      "completions/mean_terminated_length": 708.4515991210938,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 1.904320987654321,
      "grad_norm": 1.4639889924948153,
      "kl": 0.25054931640625,
      "learning_rate": 3.557929914414491e-07,
      "loss": -0.0644,
      "num_tokens": 17195434.0,
      "reward": 0.0037563112564384937,
      "reward_std": 0.006750315893441439,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0007014567381702363,
      "rewards/logprob_reward/std": 0.0023105114232748747,
      "step": 617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 902.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 660.65625,
      "completions/mean_terminated_length": 660.65625,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 1.9074074074074074,
      "grad_norm": 2.7532084091241784,
      "kl": 0.2330322265625,
      "learning_rate": 3.553398142526277e-07,
      "loss": -0.2638,
      "num_tokens": 17222695.0,
      "reward": 0.004605669528245926,
      "reward_std": 0.00885448046028614,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0016451881965622306,
      "rewards/logprob_reward/std": 0.005964573472738266,
      "step": 618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 635.28125,
      "completions/mean_terminated_length": 622.741943359375,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 1.9104938271604939,
      "grad_norm": 1.4866930666560245,
      "kl": 0.2518310546875,
      "learning_rate": 3.5488621581393736e-07,
      "loss": -0.0629,
      "num_tokens": 17249528.0,
      "reward": 0.003804399399086833,
      "reward_std": 0.007557778153568506,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.000754888285882771,
      "rewards/logprob_reward/std": 0.003386253025382757,
      "step": 619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 677.40625,
      "completions/mean_terminated_length": 654.300048828125,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 1.9135802469135803,
      "grad_norm": 1.6192875703341125,
      "kl": 0.219482421875,
      "learning_rate": 3.5443219793930073e-07,
      "loss": -0.0757,
      "num_tokens": 17277369.0,
      "reward": 0.006622261367738247,
      "reward_std": 0.012823529541492462,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00041362352203577757,
      "rewards/logprob_reward/std": 0.0013766667107120156,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 720.34375,
      "completions/mean_terminated_length": 700.1000366210938,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 1.9166666666666665,
      "grad_norm": 1.361747719158989,
      "kl": 0.2054443359375,
      "learning_rate": 3.5397776244431794e-07,
      "loss": 0.0265,
      "num_tokens": 17306976.0,
      "reward": 0.000303977431030944,
      "reward_std": 0.000607954862061888,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.000337752717314288,
      "rewards/logprob_reward/std": 0.0011352337896823883,
      "step": 621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 888.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 582.59375,
      "completions/mean_terminated_length": 582.59375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 1.9197530864197532,
      "grad_norm": 1.888818286955456,
      "kl": 0.2421875,
      "learning_rate": 3.535229111462589e-07,
      "loss": -0.0519,
      "num_tokens": 17331479.0,
      "reward": 0.009957034140825272,
      "reward_std": 0.01969798468053341,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0006467049242928624,
      "rewards/logprob_reward/std": 0.0022434887941926718,
      "step": 622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 742.5625,
      "completions/mean_terminated_length": 690.4444580078125,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 1.9228395061728394,
      "grad_norm": 1.8183940594845527,
      "kl": 0.1805419921875,
      "learning_rate": 3.530676458640567e-07,
      "loss": -0.104,
      "num_tokens": 17361937.0,
      "reward": 0.006928172893822193,
      "reward_std": 0.01311812736093998,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0007535253535024822,
      "rewards/logprob_reward/std": 0.0017174641834571958,
      "step": 623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 682.34375,
      "completions/mean_terminated_length": 659.5667114257812,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 1.925925925925926,
      "grad_norm": 1.622669129379335,
      "kl": 0.2322998046875,
      "learning_rate": 3.5261196841829957e-07,
      "loss": -0.1164,
      "num_tokens": 17390448.0,
      "reward": 0.0008956977399066091,
      "reward_std": 0.0017913954798132181,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0009952196851372719,
      "rewards/logprob_reward/std": 0.004388981498777866,
      "step": 624
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 817.0,
      "completions/mean_length": 653.6875,
      "completions/mean_terminated_length": 585.1111450195312,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 1.9290123456790123,
      "grad_norm": 1.9691449530904876,
      "kl": NaN,
      "learning_rate": 3.521558806312241e-07,
      "loss": -0.0801,
      "num_tokens": 17417398.0,
      "reward": 0.00996247585862875,
      "reward_std": 0.019323039799928665,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0006527507212013006,
      "rewards/logprob_reward/std": 0.0014504214050248265,
      "step": 625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 782.9375,
      "completions/mean_terminated_length": 738.2963256835938,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 1.932098765432099,
      "grad_norm": 1.7416981798694386,
      "kl": 0.2022705078125,
      "learning_rate": 3.5169938432670775e-07,
      "loss": -0.1037,
      "num_tokens": 17449732.0,
      "reward": 0.009690329432487488,
      "reward_std": 0.018926044926047325,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0003503664047457278,
      "rewards/logprob_reward/std": 0.0011261178879067302,
      "step": 626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 956.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 635.9375,
      "completions/mean_terminated_length": 635.9375,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 1.9351851851851851,
      "grad_norm": 2.551253548549046,
      "kl": 0.227294921875,
      "learning_rate": 3.5124248133026187e-07,
      "loss": -0.1819,
      "num_tokens": 17476330.0,
      "reward": 0.013743954710662365,
      "reward_std": 0.020080571994185448,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0013821718748658895,
      "rewards/logprob_reward/std": 0.003602338256314397,
      "step": 627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 702.8125,
      "completions/mean_terminated_length": 692.4515991210938,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 1.9382716049382716,
      "grad_norm": 2.049453800691809,
      "kl": 0.153564453125,
      "learning_rate": 3.5078517346902384e-07,
      "loss": -0.1668,
      "num_tokens": 17505348.0,
      "reward": 0.0038290387019515038,
      "reward_std": 0.007396456319838762,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0007822653278708458,
      "rewards/logprob_reward/std": 0.001585601014085114,
      "step": 628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 742.5,
      "completions/mean_terminated_length": 702.2857666015625,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 1.941358024691358,
      "grad_norm": 1.7754893581187987,
      "kl": 0.224609375,
      "learning_rate": 3.503274625717504e-07,
      "loss": -0.0476,
      "num_tokens": 17535780.0,
      "reward": 0.016202464699745178,
      "reward_std": 0.026726093143224716,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0006416262476705015,
      "rewards/logprob_reward/std": 0.001685622613877058,
      "step": 629
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 688.625,
      "completions/mean_terminated_length": 640.7142944335938,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 1.9444444444444444,
      "grad_norm": 1.0366987696053815,
      "kl": NaN,
      "learning_rate": 3.498693504688097e-07,
      "loss": -0.053,
      "num_tokens": 17564420.0,
      "reward": 0.00011124689626740292,
      "reward_std": 0.00022249379253480583,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00012360766413621604,
      "rewards/logprob_reward/std": 0.0005967440083622932,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 711.09375,
      "completions/mean_terminated_length": 666.3928833007812,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 1.9475308641975309,
      "grad_norm": 1.9263338689613325,
      "kl": 0.2337646484375,
      "learning_rate": 3.494108389921744e-07,
      "loss": -0.1325,
      "num_tokens": 17593547.0,
      "reward": 0.006682299077510834,
      "reward_std": 0.01326083205640316,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0004803319461643696,
      "rewards/logprob_reward/std": 0.0015430138446390629,
      "step": 631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 961.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 622.0,
      "completions/mean_terminated_length": 622.0,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 1.9506172839506173,
      "grad_norm": 1.1144940461560273,
      "kl": 0.2257080078125,
      "learning_rate": 3.4895192997541436e-07,
      "loss": -0.0614,
      "num_tokens": 17619683.0,
      "reward": 0.00013048779510427266,
      "reward_std": 0.00026097559020854533,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0001449864503229037,
      "rewards/logprob_reward/std": 0.0008201671880669892,
      "step": 632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 979.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 657.375,
      "completions/mean_terminated_length": 657.375,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 1.9537037037037037,
      "grad_norm": 2.7741367405573274,
      "kl": 0.20556640625,
      "learning_rate": 3.484926252536891e-07,
      "loss": -0.3178,
      "num_tokens": 17646827.0,
      "reward": 0.006848607212305069,
      "reward_std": 0.013298070058226585,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0006651186849921942,
      "rewards/logprob_reward/std": 0.001522500766441226,
      "step": 633
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 776.34375,
      "completions/mean_terminated_length": 707.0,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 1.9567901234567902,
      "grad_norm": 1.2660058001746237,
      "kl": NaN,
      "learning_rate": 3.4803292666374047e-07,
      "loss": -0.0422,
      "num_tokens": 17678622.0,
      "reward": 0.0034396664705127478,
      "reward_std": 0.006518971174955368,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00034962897188961506,
      "rewards/logprob_reward/std": 0.0009765062131918967,
      "step": 634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 975.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 620.5,
      "completions/mean_terminated_length": 620.5,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 1.9598765432098766,
      "grad_norm": 2.3222121088229892,
      "kl": 0.20465087890625,
      "learning_rate": 3.4757283604388546e-07,
      "loss": -0.3146,
      "num_tokens": 17704882.0,
      "reward": 0.010148421861231327,
      "reward_std": 0.014225448481738567,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0008593578240834177,
      "rewards/logprob_reward/std": 0.002076053060591221,
      "step": 635
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 644.96875,
      "completions/mean_terminated_length": 632.741943359375,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.9629629629629628,
      "grad_norm": 2.857289780048274,
      "kl": NaN,
      "learning_rate": 3.47112355234009e-07,
      "loss": -0.2991,
      "num_tokens": 17731801.0,
      "reward": 0.00415524048730731,
      "reward_std": 0.007971592247486115,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0011447113938629627,
      "rewards/logprob_reward/std": 0.0035985566210001707,
      "step": 636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 689.40625,
      "completions/mean_terminated_length": 654.7930908203125,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 1.9660493827160495,
      "grad_norm": 2.6457668450419636,
      "kl": 0.2266845703125,
      "learning_rate": 3.466514860755559e-07,
      "loss": -0.2339,
      "num_tokens": 17760614.0,
      "reward": 0.003990027587860823,
      "reward_std": 0.0076071722432971,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0009611418936401606,
      "rewards/logprob_reward/std": 0.0026789382100105286,
      "step": 637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 748.0625,
      "completions/mean_terminated_length": 708.6428833007812,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.9691358024691357,
      "grad_norm": 3.398117563461742,
      "kl": 0.2039794921875,
      "learning_rate": 3.4619023041152433e-07,
      "loss": -0.3676,
      "num_tokens": 17791488.0,
      "reward": 0.009694833308458328,
      "reward_std": 0.01901283487677574,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0003553707501851022,
      "rewards/logprob_reward/std": 0.0011662804754450917,
      "step": 638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 666.6875,
      "completions/mean_terminated_length": 655.1612548828125,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 1.9722222222222223,
      "grad_norm": 3.980949939208548,
      "kl": 0.1934814453125,
      "learning_rate": 3.4572859008645796e-07,
      "loss": -0.7152,
      "num_tokens": 17819310.0,
      "reward": 0.01366504654288292,
      "reward_std": 0.021684300154447556,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0012944953050464392,
      "rewards/logprob_reward/std": 0.003360053990036249,
      "step": 639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 642.875,
      "completions/mean_terminated_length": 630.5806274414062,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 1.9753086419753085,
      "grad_norm": 2.097082282872992,
      "kl": 0.2086181640625,
      "learning_rate": 3.452665669464386e-07,
      "loss": -0.1114,
      "num_tokens": 17846250.0,
      "reward": 0.012979520484805107,
      "reward_std": 0.020176339894533157,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0005327996914274991,
      "rewards/logprob_reward/std": 0.0012611752608790994,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 678.375,
      "completions/mean_terminated_length": 655.3333740234375,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 1.9783950617283952,
      "grad_norm": 2.0341866997312157,
      "kl": 0.20025634765625,
      "learning_rate": 3.448041628390791e-07,
      "loss": -0.1804,
      "num_tokens": 17874178.0,
      "reward": 0.010370473377406597,
      "reward_std": 0.014955338090658188,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0011060814140364528,
      "rewards/logprob_reward/std": 0.002451796317473054,
      "step": 641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 773.40625,
      "completions/mean_terminated_length": 703.239990234375,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 1.9814814814814814,
      "grad_norm": 1.768869897826344,
      "kl": 0.195556640625,
      "learning_rate": 3.443413796135159e-07,
      "loss": -0.1654,
      "num_tokens": 17906123.0,
      "reward": 0.006963692139834166,
      "reward_std": 0.013696135021746159,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0007929910207167268,
      "rewards/logprob_reward/std": 0.002285647438839078,
      "step": 642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 865.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 657.125,
      "completions/mean_terminated_length": 657.125,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 1.984567901234568,
      "grad_norm": 1.7234238536710782,
      "kl": 0.1939697265625,
      "learning_rate": 3.4387821912040116e-07,
      "loss": -0.144,
      "num_tokens": 17933367.0,
      "reward": 0.01052926853299141,
      "reward_std": 0.014431968331336975,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0012825197773054242,
      "rewards/logprob_reward/std": 0.0020001446828246117,
      "step": 643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 730.65625,
      "completions/mean_terminated_length": 711.1000366210938,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 1.9876543209876543,
      "grad_norm": 2.919212565274168,
      "kl": 0.19921875,
      "learning_rate": 3.4341468321189574e-07,
      "loss": -0.2925,
      "num_tokens": 17963196.0,
      "reward": 0.004549227189272642,
      "reward_std": 0.008575081825256348,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0015824747970327735,
      "rewards/logprob_reward/std": 0.00603429926559329,
      "step": 644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 949.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 672.34375,
      "completions/mean_terminated_length": 672.34375,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 1.9907407407407407,
      "grad_norm": 1.714941870799897,
      "kl": 0.1932373046875,
      "learning_rate": 3.4295077374166214e-07,
      "loss": -0.0636,
      "num_tokens": 17990939.0,
      "reward": 0.014002135023474693,
      "reward_std": 0.013756480067968369,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0016690394841134548,
      "rewards/logprob_reward/std": 0.003402333240956068,
      "step": 645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 977.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 648.5625,
      "completions/mean_terminated_length": 648.5625,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 1.9938271604938271,
      "grad_norm": 1.9059464070252758,
      "kl": 0.2001953125,
      "learning_rate": 3.4248649256485655e-07,
      "loss": -0.1257,
      "num_tokens": 18017729.0,
      "reward": 0.004415285307914019,
      "reward_std": 0.008087873458862305,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0014336502645164728,
      "rewards/logprob_reward/std": 0.004082860425114632,
      "step": 646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 957.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 720.90625,
      "completions/mean_terminated_length": 720.90625,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 1.9969135802469136,
      "grad_norm": 2.861117188565715,
      "kl": 0.18402099609375,
      "learning_rate": 3.4202184153812135e-07,
      "loss": -0.2984,
      "num_tokens": 18047642.0,
      "reward": 0.011068264953792095,
      "reward_std": 0.02071787789463997,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0018814050126820803,
      "rewards/logprob_reward/std": 0.0049293856136500835,
      "step": 647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 701.65625,
      "completions/mean_terminated_length": 668.3103637695312,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 2.0,
      "grad_norm": 3.4446847588007503,
      "kl": 0.183349609375,
      "learning_rate": 3.415568225195783e-07,
      "loss": -0.3439,
      "num_tokens": 18076455.0,
      "reward": 0.013659604825079441,
      "reward_std": 0.020921621471643448,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0012884484604001045,
      "rewards/logprob_reward/std": 0.002464776625856757,
      "step": 648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 859.0,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 639.34375,
      "completions/mean_terminated_length": 639.34375,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 2.003086419753086,
      "grad_norm": 2.8367048053235058,
      "kl": 0.2159423828125,
      "learning_rate": 3.410914373688205e-07,
      "loss": -0.2831,
      "num_tokens": 18103130.0,
      "reward": 0.010716721415519714,
      "reward_std": 0.020568618550896645,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.00149080203846097,
      "rewards/logprob_reward/std": 0.0031364334281533957,
      "step": 649
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 737.59375,
      "completions/mean_terminated_length": 696.6785888671875,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 2.006172839506173,
      "grad_norm": 1.7818935639622824,
      "kl": NaN,
      "learning_rate": 3.4062568794690536e-07,
      "loss": -0.0913,
      "num_tokens": 18133505.0,
      "reward": 0.008726210333406925,
      "reward_std": 0.014377452433109283,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.002751345979049802,
      "rewards/logprob_reward/std": 0.008913630619645119,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 787.0,
      "completions/mean_terminated_length": 708.0,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 2.009259259259259,
      "grad_norm": 2.5376291886189506,
      "kl": 0.2034912109375,
      "learning_rate": 3.401595761163468e-07,
      "loss": -0.2945,
      "num_tokens": 18165753.0,
      "reward": 0.0041171349585056305,
      "reward_std": 0.007853616960346699,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0011023720726370811,
      "rewards/logprob_reward/std": 0.0028409750666469336,
      "step": 651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 687.03125,
      "completions/mean_terminated_length": 638.8928833007812,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.0123456790123457,
      "grad_norm": 2.165864505710753,
      "kl": 0.217041015625,
      "learning_rate": 3.3969310374110817e-07,
      "loss": -0.128,
      "num_tokens": 18194070.0,
      "reward": 0.011410648003220558,
      "reward_std": 0.015654362738132477,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0022618311922997236,
      "rewards/logprob_reward/std": 0.005047931801527739,
      "step": 652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 770.875,
      "completions/mean_terminated_length": 724.0,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 2.015432098765432,
      "grad_norm": 3.129542329363188,
      "kl": 0.206787109375,
      "learning_rate": 3.3922627268659467e-07,
      "loss": -0.3592,
      "num_tokens": 18225798.0,
      "reward": 0.006686346139758825,
      "reward_std": 0.013143929652869701,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00048482915735803545,
      "rewards/logprob_reward/std": 0.0012821360724046826,
      "step": 653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 700.375,
      "completions/mean_terminated_length": 678.800048828125,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 2.0185185185185186,
      "grad_norm": 1.7025150053825844,
      "kl": 0.1939697265625,
      "learning_rate": 3.387590848196456e-07,
      "loss": -0.075,
      "num_tokens": 18254870.0,
      "reward": 0.020044365897774696,
      "reward_std": 0.0279097743332386,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0014381844084709883,
      "rewards/logprob_reward/std": 0.002929187845438719,
      "step": 654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 745.75,
      "completions/mean_terminated_length": 716.9655151367188,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 2.021604938271605,
      "grad_norm": 1.3789003528592825,
      "kl": 0.1744384765625,
      "learning_rate": 3.382915420085274e-07,
      "loss": -0.0817,
      "num_tokens": 18285474.0,
      "reward": 0.00401701033115387,
      "reward_std": 0.007092323154211044,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0009911225643008947,
      "rewards/logprob_reward/std": 0.001909551676362753,
      "step": 655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 891.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 646.59375,
      "completions/mean_terminated_length": 646.59375,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 2.0246913580246915,
      "grad_norm": 1.4971325155816617,
      "kl": 0.1964111328125,
      "learning_rate": 3.3782364612292574e-07,
      "loss": -0.0537,
      "num_tokens": 18312153.0,
      "reward": 0.0006674743490293622,
      "reward_std": 0.0008368261624127626,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0007416382431983948,
      "rewards/logprob_reward/std": 0.0015969941159710288,
      "step": 656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 714.59375,
      "completions/mean_terminated_length": 682.586181640625,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 2.0277777777777777,
      "grad_norm": 2.1720589107666544,
      "kl": 0.2088623046875,
      "learning_rate": 3.3735539903393826e-07,
      "loss": -0.0993,
      "num_tokens": 18341308.0,
      "reward": 0.01398205291479826,
      "reward_std": 0.020772812888026237,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0016467259265482426,
      "rewards/logprob_reward/std": 0.0027846088632941246,
      "step": 657
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 719.25,
      "completions/mean_terminated_length": 709.4193115234375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 2.0308641975308643,
      "grad_norm": 1.333437859197381,
      "kl": NaN,
      "learning_rate": 3.368868026140672e-07,
      "loss": 0.0051,
      "num_tokens": 18370472.0,
      "reward": 0.00979865062981844,
      "reward_std": 0.014177567325532436,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0004707224143203348,
      "rewards/logprob_reward/std": 0.001750817522406578,
      "step": 658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 699.03125,
      "completions/mean_terminated_length": 677.36669921875,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 2.0339506172839505,
      "grad_norm": 2.1036568152035935,
      "kl": 0.1922607421875,
      "learning_rate": 3.364178587372115e-07,
      "loss": -0.188,
      "num_tokens": 18399049.0,
      "reward": 0.004447158891707659,
      "reward_std": 0.00822368636727333,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0014690652024000883,
      "rewards/logprob_reward/std": 0.0029987546149641275,
      "step": 659
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 731.15625,
      "completions/mean_terminated_length": 663.5769653320312,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 2.037037037037037,
      "grad_norm": 1.9055595221485488,
      "kl": NaN,
      "learning_rate": 3.359485692786597e-07,
      "loss": -0.144,
      "num_tokens": 18428938.0,
      "reward": 0.006459952797740698,
      "reward_std": 0.012794826179742813,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00023328073439188302,
      "rewards/logprob_reward/std": 0.0006398450350388885,
      "step": 660
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 708.5625,
      "completions/mean_terminated_length": 663.5,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 2.0401234567901234,
      "grad_norm": 1.266916597384057,
      "kl": NaN,
      "learning_rate": 3.354789361150824e-07,
      "loss": -0.0315,
      "num_tokens": 18457924.0,
      "reward": 0.0071504078805446625,
      "reward_std": 0.013596047647297382,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0010004526702687144,
      "rewards/logprob_reward/std": 0.002534077037125826,
      "step": 661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 701.4375,
      "completions/mean_terminated_length": 679.933349609375,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 2.04320987654321,
      "grad_norm": 1.9044580996992468,
      "kl": 0.1959228515625,
      "learning_rate": 3.350089611245246e-07,
      "loss": -0.185,
      "num_tokens": 18486814.0,
      "reward": 0.00792771764099598,
      "reward_std": 0.009091407991945744,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0018641313072293997,
      "rewards/logprob_reward/std": 0.0034033788833767176,
      "step": 662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 776.4375,
      "completions/mean_terminated_length": 750.8275756835938,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 2.0462962962962963,
      "grad_norm": 1.8380414945608772,
      "kl": 0.17919921875,
      "learning_rate": 3.345386461863981e-07,
      "loss": -0.0741,
      "num_tokens": 18518560.0,
      "reward": 0.018665527924895287,
      "reward_std": 0.00801034551113844,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0033783656544983387,
      "rewards/logprob_reward/std": 0.005880396813154221,
      "step": 663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 967.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 708.0625,
      "completions/mean_terminated_length": 708.0625,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 2.049382716049383,
      "grad_norm": 1.9473130330652877,
      "kl": 0.1395263671875,
      "learning_rate": 3.340679931814743e-07,
      "loss": -0.2052,
      "num_tokens": 18547734.0,
      "reward": 0.016261916607618332,
      "reward_std": 0.02662317454814911,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0007076855399645865,
      "rewards/logprob_reward/std": 0.0016589416190981865,
      "step": 664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 736.71875,
      "completions/mean_terminated_length": 695.6785888671875,
      "completions/min_length": 580.0,
      "completions/min_terminated_length": 580.0,
      "epoch": 2.052469135802469,
      "grad_norm": 1.703841123020719,
      "kl": 0.189453125,
      "learning_rate": 3.3359700399187654e-07,
      "loss": -0.1388,
      "num_tokens": 18577937.0,
      "reward": 0.004150180146098137,
      "reward_std": 0.007443716283887625,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.001139089115895331,
      "rewards/logprob_reward/std": 0.003203223692253232,
      "step": 665
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 985.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 698.125,
      "completions/mean_terminated_length": 698.125,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 2.0555555555555554,
      "grad_norm": 2.475517824742073,
      "kl": NaN,
      "learning_rate": 3.331256805010724e-07,
      "loss": -0.2655,
      "num_tokens": 18606221.0,
      "reward": 0.007154985796660185,
      "reward_std": 0.013995552435517311,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0010055399034172297,
      "rewards/logprob_reward/std": 0.0031242729164659977,
      "step": 666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 683.28125,
      "completions/mean_terminated_length": 672.290283203125,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 2.058641975308642,
      "grad_norm": 1.6183520507006008,
      "kl": 0.1864013671875,
      "learning_rate": 3.326540245938666e-07,
      "loss": -0.074,
      "num_tokens": 18634210.0,
      "reward": 0.0033837261144071817,
      "reward_std": 0.006697164848446846,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0002874734054785222,
      "rewards/logprob_reward/std": 0.0007958361529745162,
      "step": 667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 656.65625,
      "completions/mean_terminated_length": 618.6551513671875,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 2.0617283950617282,
      "grad_norm": 2.0183462162596433,
      "kl": 0.1925048828125,
      "learning_rate": 3.3218203815639265e-07,
      "loss": -0.1317,
      "num_tokens": 18661287.0,
      "reward": 0.009772385470569134,
      "reward_std": 0.019369445741176605,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0004415393341332674,
      "rewards/logprob_reward/std": 0.0011397881899029016,
      "step": 668
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 805.0625,
      "completions/mean_terminated_length": 743.760009765625,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 2.064814814814815,
      "grad_norm": 1.7423276364718858,
      "kl": NaN,
      "learning_rate": 3.3170972307610654e-07,
      "loss": -0.0686,
      "num_tokens": 18694153.0,
      "reward": 0.006426104810088873,
      "reward_std": 0.012474000453948975,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.00019567189156077802,
      "rewards/logprob_reward/std": 0.0009213192970491946,
      "step": 669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 728.625,
      "completions/mean_terminated_length": 719.0967407226562,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 2.067901234567901,
      "grad_norm": 1.6042362855286607,
      "kl": 0.182373046875,
      "learning_rate": 3.312370812417779e-07,
      "loss": -0.0821,
      "num_tokens": 18723577.0,
      "reward": 0.0014206302585080266,
      "reward_std": 0.0014337702887132764,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.001578478142619133,
      "rewards/logprob_reward/std": 0.0031967894174158573,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 716.59375,
      "completions/mean_terminated_length": 706.6773681640625,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 2.0709876543209877,
      "grad_norm": 2.557187308566611,
      "kl": 0.1739501953125,
      "learning_rate": 3.3076411454348336e-07,
      "loss": -0.1886,
      "num_tokens": 18752736.0,
      "reward": 0.011115819215774536,
      "reward_std": 0.0204685777425766,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0019342433661222458,
      "rewards/logprob_reward/std": 0.0029235610272735357,
      "step": 671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 759.03125,
      "completions/mean_terminated_length": 721.1785888671875,
      "completions/min_length": 525.0,
      "completions/min_terminated_length": 525.0,
      "epoch": 2.074074074074074,
      "grad_norm": 3.7069889032155583,
      "kl": 0.1595458984375,
      "learning_rate": 3.3029082487259847e-07,
      "loss": -0.3376,
      "num_tokens": 18783641.0,
      "reward": 0.00557498587295413,
      "reward_std": 0.01114997174590826,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0027222069911658764,
      "rewards/logprob_reward/std": 0.009254386648535728,
      "step": 672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 683.3125,
      "completions/mean_terminated_length": 660.6000366210938,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 2.0771604938271606,
      "grad_norm": 2.615035737173366,
      "kl": 0.21624755859375,
      "learning_rate": 3.298172141217905e-07,
      "loss": -0.3121,
      "num_tokens": 18811987.0,
      "reward": 0.007663974072784185,
      "reward_std": 0.01448277197778225,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0015710823936387897,
      "rewards/logprob_reward/std": 0.0026468480937182903,
      "step": 673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 815.0,
      "completions/max_terminated_length": 815.0,
      "completions/mean_length": 608.21875,
      "completions/mean_terminated_length": 608.21875,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 2.080246913580247,
      "grad_norm": 3.429308325454037,
      "kl": 0.1944580078125,
      "learning_rate": 3.2934328418501064e-07,
      "loss": -0.427,
      "num_tokens": 18837502.0,
      "reward": 0.01980498433113098,
      "reward_std": 0.03812722861766815,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0011722053168341517,
      "rewards/logprob_reward/std": 0.0017390131251886487,
      "step": 674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 743.78125,
      "completions/mean_terminated_length": 714.7930908203125,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 2.0833333333333335,
      "grad_norm": 2.077398647842541,
      "kl": 0.1700439453125,
      "learning_rate": 3.2886903695748647e-07,
      "loss": -0.1476,
      "num_tokens": 18867563.0,
      "reward": 0.010390140116214752,
      "reward_std": 0.019611971452832222,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.001127932919189334,
      "rewards/logprob_reward/std": 0.0022287664469331503,
      "step": 675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 647.96875,
      "completions/mean_terminated_length": 622.9000244140625,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 2.0864197530864197,
      "grad_norm": 3.081639433507794,
      "kl": 0.1923828125,
      "learning_rate": 3.2839447433571454e-07,
      "loss": -0.3706,
      "num_tokens": 18894266.0,
      "reward": 0.004312549717724323,
      "reward_std": 0.00768632534891367,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0013194999191910028,
      "rewards/logprob_reward/std": 0.0021036118268966675,
      "step": 676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1021.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 693.9375,
      "completions/mean_terminated_length": 693.9375,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 2.0895061728395063,
      "grad_norm": 1.7451066403736228,
      "kl": 0.197021484375,
      "learning_rate": 3.279195982174524e-07,
      "loss": -0.0397,
      "num_tokens": 18922744.0,
      "reward": 0.016954107210040092,
      "reward_std": 0.02626674249768257,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.001476786215789616,
      "rewards/logprob_reward/std": 0.002165104728192091,
      "step": 677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 778.5625,
      "completions/mean_terminated_length": 721.923095703125,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 2.0925925925925926,
      "grad_norm": 1.7709637996139636,
      "kl": 0.1868896484375,
      "learning_rate": 3.2744441050171136e-07,
      "loss": -0.0327,
      "num_tokens": 18954366.0,
      "reward": 0.017053451389074326,
      "reward_std": 0.02214268036186695,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.00158716703299433,
      "rewards/logprob_reward/std": 0.0030286668334156275,
      "step": 678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 674.21875,
      "completions/mean_terminated_length": 638.0344848632812,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 2.095679012345679,
      "grad_norm": 2.2810111545190623,
      "kl": 0.1983642578125,
      "learning_rate": 3.26968913088749e-07,
      "loss": -0.0857,
      "num_tokens": 18982253.0,
      "reward": 0.006142496131360531,
      "reward_std": 0.008889172226190567,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.003352772444486618,
      "rewards/logprob_reward/std": 0.006975427269935608,
      "step": 679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 890.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 607.5,
      "completions/mean_terminated_length": 607.5,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 2.0987654320987654,
      "grad_norm": 2.4110704442814,
      "kl": 0.195556640625,
      "learning_rate": 3.264931078800611e-07,
      "loss": -0.2248,
      "num_tokens": 19007681.0,
      "reward": 0.01696503534913063,
      "reward_std": 0.019929613918066025,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0014889281010255218,
      "rewards/logprob_reward/std": 0.0025145707186311483,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 795.0,
      "completions/mean_length": 652.34375,
      "completions/mean_terminated_length": 627.5667114257812,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 2.1018518518518516,
      "grad_norm": 1.9426391620859809,
      "kl": 0.197265625,
      "learning_rate": 3.260169967783744e-07,
      "loss": -0.1323,
      "num_tokens": 19034720.0,
      "reward": 0.029838038608431816,
      "reward_std": 0.027410298585891724,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0019033756107091904,
      "rewards/logprob_reward/std": 0.0024241837672889233,
      "step": 681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 742.375,
      "completions/mean_terminated_length": 713.2413940429688,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 2.1049382716049383,
      "grad_norm": 2.548683868468059,
      "kl": 0.17132568359375,
      "learning_rate": 3.255405816876389e-07,
      "loss": -0.2088,
      "num_tokens": 19065240.0,
      "reward": 0.010009588673710823,
      "reward_std": 0.019502948969602585,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0007050983840599656,
      "rewards/logprob_reward/std": 0.0015305898850783706,
      "step": 682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 842.28125,
      "completions/mean_terminated_length": 771.1739501953125,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 2.1080246913580245,
      "grad_norm": 2.351741854366014,
      "kl": 0.19775390625,
      "learning_rate": 3.250638645130204e-07,
      "loss": -0.1794,
      "num_tokens": 19099337.0,
      "reward": 0.013157753273844719,
      "reward_std": 0.02523644268512726,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0007308362983167171,
      "rewards/logprob_reward/std": 0.002028500894084573,
      "step": 683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 689.21875,
      "completions/mean_terminated_length": 654.586181640625,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 2.111111111111111,
      "grad_norm": 1.7230649399481124,
      "kl": 0.21240234375,
      "learning_rate": 3.2458684716089224e-07,
      "loss": -0.1002,
      "num_tokens": 19127832.0,
      "reward": 0.014887186698615551,
      "reward_std": 0.02193785086274147,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0026524297427386045,
      "rewards/logprob_reward/std": 0.005362158641219139,
      "step": 684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1015.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 713.09375,
      "completions/mean_terminated_length": 713.09375,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 2.1141975308641974,
      "grad_norm": 2.10335220372187,
      "kl": 0.1942138671875,
      "learning_rate": 3.241095315388287e-07,
      "loss": -0.1358,
      "num_tokens": 19157211.0,
      "reward": 0.025678927078843117,
      "reward_std": 0.03614753484725952,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.004226584918797016,
      "rewards/logprob_reward/std": 0.007038444746285677,
      "step": 685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 667.0,
      "completions/mean_terminated_length": 655.4838256835938,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.117283950617284,
      "grad_norm": 2.013781515668275,
      "kl": 0.185302734375,
      "learning_rate": 3.2363191955559656e-07,
      "loss": -0.0637,
      "num_tokens": 19184643.0,
      "reward": 0.013850882649421692,
      "reward_std": 0.013981991447508335,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.001500982092693448,
      "rewards/logprob_reward/std": 0.0026388971600681543,
      "step": 686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 966.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 628.65625,
      "completions/mean_terminated_length": 628.65625,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 2.1203703703703702,
      "grad_norm": 1.7190092386605302,
      "kl": 0.23095703125,
      "learning_rate": 3.231540131211478e-07,
      "loss": -0.0553,
      "num_tokens": 19211044.0,
      "reward": 0.007108983118087053,
      "reward_std": 0.013545095920562744,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0009544256026856601,
      "rewards/logprob_reward/std": 0.0021833155769854784,
      "step": 687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 975.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 640.28125,
      "completions/mean_terminated_length": 640.28125,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 2.123456790123457,
      "grad_norm": 3.071812389989443,
      "kl": 0.1951904296875,
      "learning_rate": 3.22675814146612e-07,
      "loss": -0.2527,
      "num_tokens": 19237345.0,
      "reward": 0.022753456607460976,
      "reward_std": 0.03936662897467613,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0009760628454387188,
      "rewards/logprob_reward/std": 0.0018598815659061074,
      "step": 688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 841.0,
      "completions/mean_length": 640.15625,
      "completions/mean_terminated_length": 627.774169921875,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 2.126543209876543,
      "grad_norm": 2.2048609829488877,
      "kl": 0.2049560546875,
      "learning_rate": 3.221973245442883e-07,
      "loss": -0.1128,
      "num_tokens": 19263990.0,
      "reward": 0.011205012910068035,
      "reward_std": 0.015312530100345612,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0020333474967628717,
      "rewards/logprob_reward/std": 0.0030404385179281235,
      "step": 689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 777.78125,
      "completions/mean_terminated_length": 732.1851806640625,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 2.1296296296296298,
      "grad_norm": 1.634631378336532,
      "kl": 0.17413330078125,
      "learning_rate": 3.217185462276382e-07,
      "loss": -0.1329,
      "num_tokens": 19295775.0,
      "reward": 0.007836061529815197,
      "reward_std": 0.014449788257479668,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0017622908344492316,
      "rewards/logprob_reward/std": 0.003121594898402691,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 708.8125,
      "completions/mean_terminated_length": 650.4444580078125,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 2.132716049382716,
      "grad_norm": 2.283304174740487,
      "kl": 0.2203369140625,
      "learning_rate": 3.2123948111127795e-07,
      "loss": -0.1059,
      "num_tokens": 19324805.0,
      "reward": 0.013735933229327202,
      "reward_std": 0.026334304362535477,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.001373259350657463,
      "rewards/logprob_reward/std": 0.002092954469844699,
      "step": 691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 733.8125,
      "completions/mean_terminated_length": 703.7930908203125,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 2.1358024691358026,
      "grad_norm": 3.544811007756336,
      "kl": 0.2158203125,
      "learning_rate": 3.2076013111097055e-07,
      "loss": -0.2391,
      "num_tokens": 19355451.0,
      "reward": 0.014487557113170624,
      "reward_std": 0.01965447887778282,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.005680619738996029,
      "rewards/logprob_reward/std": 0.00881699938327074,
      "step": 692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 757.15625,
      "completions/mean_terminated_length": 695.5769653320312,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 2.138888888888889,
      "grad_norm": 1.7368220628411621,
      "kl": 0.182373046875,
      "learning_rate": 3.20280498143618e-07,
      "loss": -0.0938,
      "num_tokens": 19386480.0,
      "reward": 0.010229920968413353,
      "reward_std": 0.014454798772931099,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0009499117149971426,
      "rewards/logprob_reward/std": 0.0017783971270546317,
      "step": 693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 698.40625,
      "completions/mean_terminated_length": 664.72412109375,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 2.1419753086419755,
      "grad_norm": 1.6837417520038844,
      "kl": 0.1942138671875,
      "learning_rate": 3.1980058412725436e-07,
      "loss": 0.0112,
      "num_tokens": 19415325.0,
      "reward": 0.013666713610291481,
      "reward_std": 0.02048582024872303,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0012963481713086367,
      "rewards/logprob_reward/std": 0.002260439097881317,
      "step": 694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 793.34375,
      "completions/mean_terminated_length": 740.1154174804688,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 2.1450617283950617,
      "grad_norm": 1.8016214015142245,
      "kl": 0.17626953125,
      "learning_rate": 3.1932039098103723e-07,
      "loss": -0.0566,
      "num_tokens": 19447440.0,
      "reward": 0.023225625976920128,
      "reward_std": 0.019267966970801353,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0015006960602477193,
      "rewards/logprob_reward/std": 0.002450818894430995,
      "step": 695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 774.59375,
      "completions/mean_terminated_length": 728.4074096679688,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 2.148148148148148,
      "grad_norm": 2.168445493493645,
      "kl": 0.1729736328125,
      "learning_rate": 3.188399206252406e-07,
      "loss": -0.0589,
      "num_tokens": 19479239.0,
      "reward": 0.010763797909021378,
      "reward_std": 0.018367238342761993,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0050153303891420364,
      "rewards/logprob_reward/std": 0.009430542588233948,
      "step": 696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 740.96875,
      "completions/mean_terminated_length": 688.5555419921875,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 2.1512345679012346,
      "grad_norm": 1.7668619588278764,
      "kl": 0.1787109375,
      "learning_rate": 3.183591749812468e-07,
      "loss": -0.1101,
      "num_tokens": 19509878.0,
      "reward": 0.011277096346020699,
      "reward_std": 0.015580618754029274,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0021134396083652973,
      "rewards/logprob_reward/std": 0.004006913397461176,
      "step": 697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 724.3125,
      "completions/mean_terminated_length": 640.3999633789062,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 2.154320987654321,
      "grad_norm": 2.014204884923914,
      "kl": 0.201416015625,
      "learning_rate": 3.1787815597153934e-07,
      "loss": -0.0338,
      "num_tokens": 19539788.0,
      "reward": 0.007484388537704945,
      "reward_std": 0.01386910118162632,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.001371542806737125,
      "rewards/logprob_reward/std": 0.0021594043355435133,
      "step": 698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 653.46875,
      "completions/mean_terminated_length": 641.51611328125,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 2.1574074074074074,
      "grad_norm": 2.0127198116107867,
      "kl": 0.2174072265625,
      "learning_rate": 3.173968655196947e-07,
      "loss": -0.0456,
      "num_tokens": 19566687.0,
      "reward": 0.010289727710187435,
      "reward_std": 0.014571724459528923,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0010163638507947326,
      "rewards/logprob_reward/std": 0.0018873271765187383,
      "step": 699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 804.34375,
      "completions/mean_terminated_length": 753.6538696289062,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 512.0,
      "epoch": 2.1604938271604937,
      "grad_norm": 2.6669954316042555,
      "kl": 0.17919921875,
      "learning_rate": 3.1691530555037493e-07,
      "loss": -0.2577,
      "num_tokens": 19599266.0,
      "reward": 0.013990317471325397,
      "reward_std": 0.02109478786587715,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0016559083014726639,
      "rewards/logprob_reward/std": 0.0022258798126131296,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 744.46875,
      "completions/mean_terminated_length": 735.4515991210938,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 2.1635802469135803,
      "grad_norm": 2.5513428031933043,
      "kl": 0.16241455078125,
      "learning_rate": 3.164334779893198e-07,
      "loss": -0.2988,
      "num_tokens": 19630097.0,
      "reward": 0.003614734159782529,
      "reward_std": 0.003220351180061698,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.004016371443867683,
      "rewards/logprob_reward/std": 0.0058664362877607346,
      "step": 701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1006.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 731.71875,
      "completions/mean_terminated_length": 731.71875,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 2.1666666666666665,
      "grad_norm": 1.973620737873119,
      "kl": 0.176513671875,
      "learning_rate": 3.159513847633393e-07,
      "loss": -0.0696,
      "num_tokens": 19659864.0,
      "reward": 0.014876470901072025,
      "reward_std": 0.026599494740366936,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.002640522550791502,
      "rewards/logprob_reward/std": 0.0033077364787459373,
      "step": 702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 740.9375,
      "completions/mean_terminated_length": 722.0667114257812,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 2.169753086419753,
      "grad_norm": 1.8805013932396657,
      "kl": 0.1842041015625,
      "learning_rate": 3.1546902780030555e-07,
      "loss": -0.0033,
      "num_tokens": 19690006.0,
      "reward": 0.01714780181646347,
      "reward_std": 0.027524225413799286,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0016920019406825304,
      "rewards/logprob_reward/std": 0.002416391856968403,
      "step": 703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 713.0625,
      "completions/mean_terminated_length": 655.4815063476562,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 2.1728395061728394,
      "grad_norm": 2.9018920083774518,
      "kl": 0.21142578125,
      "learning_rate": 3.1498640902914565e-07,
      "loss": -0.1858,
      "num_tokens": 19719600.0,
      "reward": 0.015629183501005173,
      "reward_std": 0.022466342896223068,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.003476869547739625,
      "rewards/logprob_reward/std": 0.004384683445096016,
      "step": 704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 648.8125,
      "completions/mean_terminated_length": 636.7096557617188,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 2.175925925925926,
      "grad_norm": 1.8384344471623555,
      "kl": 0.2064208984375,
      "learning_rate": 3.1450353037983346e-07,
      "loss": -0.1312,
      "num_tokens": 19746418.0,
      "reward": 0.017334118485450745,
      "reward_std": 0.027243902906775475,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.001899020280689001,
      "rewards/logprob_reward/std": 0.003224569372832775,
      "step": 705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 684.84375,
      "completions/mean_terminated_length": 649.7586059570312,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 2.1790123456790123,
      "grad_norm": 1.7258756181759107,
      "kl": 0.2039794921875,
      "learning_rate": 3.140203937833821e-07,
      "loss": -0.038,
      "num_tokens": 19774681.0,
      "reward": 0.007824774831533432,
      "reward_std": 0.013877566903829575,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0017497497610747814,
      "rewards/logprob_reward/std": 0.0029178455006331205,
      "step": 706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 671.15625,
      "completions/mean_terminated_length": 634.6551513671875,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 2.182098765432099,
      "grad_norm": 3.5582507821007114,
      "kl": 0.21875,
      "learning_rate": 3.135370011718364e-07,
      "loss": -0.2457,
      "num_tokens": 19802574.0,
      "reward": 0.022976461797952652,
      "reward_std": 0.03377217799425125,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0012238477356731892,
      "rewards/logprob_reward/std": 0.002058252226561308,
      "step": 707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 675.09375,
      "completions/mean_terminated_length": 663.8386840820312,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 2.185185185185185,
      "grad_norm": 2.066657496155649,
      "kl": 0.218505859375,
      "learning_rate": 3.1305335447826477e-07,
      "loss": -0.0358,
      "num_tokens": 19830921.0,
      "reward": 0.027299972251057625,
      "reward_std": 0.0403885543346405,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0025555237662047148,
      "rewards/logprob_reward/std": 0.002870726864784956,
      "step": 708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 697.84375,
      "completions/mean_terminated_length": 687.3225708007812,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 2.1882716049382718,
      "grad_norm": 1.8649316470081814,
      "kl": 0.1871337890625,
      "learning_rate": 3.125694556367517e-07,
      "loss": -0.0515,
      "num_tokens": 19859400.0,
      "reward": 0.02298777922987938,
      "reward_std": 0.03882042318582535,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0012364235008135438,
      "rewards/logprob_reward/std": 0.0017622843151912093,
      "step": 709
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 831.40625,
      "completions/mean_terminated_length": 681.6111450195312,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.191358024691358,
      "grad_norm": 2.2503754641255838,
      "kl": NaN,
      "learning_rate": 3.1208530658239e-07,
      "loss": -0.0248,
      "num_tokens": 19893309.0,
      "reward": 0.00822516344487667,
      "reward_std": 0.01045928057283163,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.002194626722484827,
      "rewards/logprob_reward/std": 0.005198339931666851,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 706.6875,
      "completions/mean_terminated_length": 696.4515991210938,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 2.1944444444444446,
      "grad_norm": 1.9686203354037635,
      "kl": 0.2001953125,
      "learning_rate": 3.1160090925127325e-07,
      "loss": -0.1546,
      "num_tokens": 19922259.0,
      "reward": 0.024451250210404396,
      "reward_std": 0.030137833207845688,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.002862499561160803,
      "rewards/logprob_reward/std": 0.004403434693813324,
      "step": 711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 706.3125,
      "completions/mean_terminated_length": 660.9285888671875,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 2.197530864197531,
      "grad_norm": 2.0815079933441116,
      "kl": 0.2012939453125,
      "learning_rate": 3.1111626558048777e-07,
      "loss": -0.1331,
      "num_tokens": 19951337.0,
      "reward": 0.01722552999854088,
      "reward_std": 0.02297770418226719,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0017783672083169222,
      "rewards/logprob_reward/std": 0.003334183944389224,
      "step": 712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 936.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 692.8125,
      "completions/mean_terminated_length": 692.8125,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.200617283950617,
      "grad_norm": 1.8152786652921813,
      "kl": 0.2042236328125,
      "learning_rate": 3.1063137750810493e-07,
      "loss": -0.063,
      "num_tokens": 19980051.0,
      "reward": 0.014371870085597038,
      "reward_std": 0.02127542346715927,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.002079855417832732,
      "rewards/logprob_reward/std": 0.0030030657071620226,
      "step": 713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 740.0,
      "completions/mean_terminated_length": 699.4285888671875,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 2.2037037037037037,
      "grad_norm": 3.001523321516778,
      "kl": 0.1907958984375,
      "learning_rate": 3.101462469731735e-07,
      "loss": -0.3082,
      "num_tokens": 20010515.0,
      "reward": 0.008616460487246513,
      "reward_std": 0.015965763479471207,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0026294009294360876,
      "rewards/logprob_reward/std": 0.004724571481347084,
      "step": 714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 669.1875,
      "completions/mean_terminated_length": 645.5333862304688,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 2.20679012345679,
      "grad_norm": 1.5780098412505246,
      "kl": 0.2098388671875,
      "learning_rate": 3.0966087591571184e-07,
      "loss": -0.0214,
      "num_tokens": 20038189.0,
      "reward": 0.02048753760755062,
      "reward_std": 0.022141898050904274,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001930593978613615,
      "rewards/logprob_reward/std": 0.003870361717417836,
      "step": 715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 685.90625,
      "completions/mean_terminated_length": 663.36669921875,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 2.2098765432098766,
      "grad_norm": 1.9906976149586653,
      "kl": 0.19647216796875,
      "learning_rate": 3.091752662767001e-07,
      "loss": -0.0742,
      "num_tokens": 20066510.0,
      "reward": 0.02095744013786316,
      "reward_std": 0.03419157490134239,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0024527120403945446,
      "rewards/logprob_reward/std": 0.004117546137422323,
      "step": 716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 667.84375,
      "completions/mean_terminated_length": 656.3547973632812,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 2.212962962962963,
      "grad_norm": 2.1312491887377787,
      "kl": 0.2139892578125,
      "learning_rate": 3.0868941999807274e-07,
      "loss": -0.1352,
      "num_tokens": 20094169.0,
      "reward": 0.011528492905199528,
      "reward_std": 0.020114785060286522,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.002392769791185856,
      "rewards/logprob_reward/std": 0.00551227293908596,
      "step": 717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 981.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 664.90625,
      "completions/mean_terminated_length": 664.90625,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 2.2160493827160495,
      "grad_norm": 1.819554122367732,
      "kl": 0.197265625,
      "learning_rate": 3.082033390227102e-07,
      "loss": 0.0268,
      "num_tokens": 20121990.0,
      "reward": 0.01730799302458763,
      "reward_std": 0.027153251692652702,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.001869992003776133,
      "rewards/logprob_reward/std": 0.0025917806196957827,
      "step": 718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 978.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 662.0625,
      "completions/mean_terminated_length": 662.0625,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 2.2191358024691357,
      "grad_norm": 1.7887861245248962,
      "kl": 0.1986083984375,
      "learning_rate": 3.0771702529443163e-07,
      "loss": -0.0369,
      "num_tokens": 20149468.0,
      "reward": 0.022415729239583015,
      "reward_std": 0.029145292937755585,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.004073033109307289,
      "rewards/logprob_reward/std": 0.009793604724109173,
      "step": 719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 707.4375,
      "completions/mean_terminated_length": 674.6896362304688,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 2.2222222222222223,
      "grad_norm": 2.473500654696141,
      "kl": 0.191650390625,
      "learning_rate": 3.0723048075798694e-07,
      "loss": -0.1671,
      "num_tokens": 20178214.0,
      "reward": 0.01057870127260685,
      "reward_std": 0.02028326690196991,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0013374458067119122,
      "rewards/logprob_reward/std": 0.002189180813729763,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1003.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 667.53125,
      "completions/mean_terminated_length": 667.53125,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 2.2253086419753085,
      "grad_norm": 2.0823282597420847,
      "kl": 0.2186279296875,
      "learning_rate": 3.0674370735904917e-07,
      "loss": -0.1845,
      "num_tokens": 20205755.0,
      "reward": 0.02725653350353241,
      "reward_std": 0.03393562510609627,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0025072579737752676,
      "rewards/logprob_reward/std": 0.003850584151223302,
      "step": 721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 723.65625,
      "completions/mean_terminated_length": 680.75,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 2.228395061728395,
      "grad_norm": 2.1220284831023544,
      "kl": 0.20318603515625,
      "learning_rate": 3.0625670704420634e-07,
      "loss": -0.1226,
      "num_tokens": 20235716.0,
      "reward": 0.008036112412810326,
      "reward_std": 0.014271697029471397,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0019845697097480297,
      "rewards/logprob_reward/std": 0.0030064627062529325,
      "step": 722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 684.65625,
      "completions/mean_terminated_length": 662.0333862304688,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 2.2314814814814814,
      "grad_norm": 1.8710458481733792,
      "kl": 0.17779541015625,
      "learning_rate": 3.057694817609539e-07,
      "loss": -0.1808,
      "num_tokens": 20264109.0,
      "reward": 0.00783197395503521,
      "reward_std": 0.00933231133967638,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0017577486578375101,
      "rewards/logprob_reward/std": 0.0028715496882796288,
      "step": 723
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 738.21875,
      "completions/mean_terminated_length": 672.2692260742188,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 2.234567901234568,
      "grad_norm": 1.9014327670978726,
      "kl": NaN,
      "learning_rate": 3.0528203345768717e-07,
      "loss": -0.1328,
      "num_tokens": 20294540.0,
      "reward": 0.004208831116557121,
      "reward_std": 0.007450885139405727,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0012042568996548653,
      "rewards/logprob_reward/std": 0.0021676637697964907,
      "step": 724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 708.09375,
      "completions/mean_terminated_length": 697.9031982421875,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 2.2376543209876543,
      "grad_norm": 2.0511718997832578,
      "kl": 0.19927978515625,
      "learning_rate": 3.047943640836931e-07,
      "loss": -0.0949,
      "num_tokens": 20323371.0,
      "reward": 0.010843828320503235,
      "reward_std": 0.019358793273568153,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0016320315189659595,
      "rewards/logprob_reward/std": 0.002632023999467492,
      "step": 725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 670.4375,
      "completions/mean_terminated_length": 659.0322265625,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 2.240740740740741,
      "grad_norm": 1.985521889256597,
      "kl": 0.2191162109375,
      "learning_rate": 3.0430647558914284e-07,
      "loss": -0.1478,
      "num_tokens": 20350897.0,
      "reward": 0.0018935356056317687,
      "reward_std": 0.0018031983636319637,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0021039284765720367,
      "rewards/logprob_reward/std": 0.0025468331295996904,
      "step": 726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1022.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 682.1875,
      "completions/mean_terminated_length": 682.1875,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 2.243827160493827,
      "grad_norm": 3.0085913078454514,
      "kl": 0.158935546875,
      "learning_rate": 3.038183699250837e-07,
      "loss": -0.4066,
      "num_tokens": 20379207.0,
      "reward": 0.008154462091624737,
      "reward_std": 0.011152166873216629,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.005588292144238949,
      "rewards/logprob_reward/std": 0.01106759998947382,
      "step": 727
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 739.375,
      "completions/mean_terminated_length": 659.6799926757812,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 2.246913580246914,
      "grad_norm": 1.6009103603948496,
      "kl": NaN,
      "learning_rate": 3.0333004904343153e-07,
      "loss": -0.1468,
      "num_tokens": 20409499.0,
      "reward": 0.016564439982175827,
      "reward_std": 0.02129506506025791,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0010438221506774426,
      "rewards/logprob_reward/std": 0.001586288446560502,
      "step": 728
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 744.71875,
      "completions/mean_terminated_length": 635.434814453125,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 2.25,
      "grad_norm": 1.783686655560749,
      "kl": NaN,
      "learning_rate": 3.0284151489696264e-07,
      "loss": -0.0725,
      "num_tokens": 20440014.0,
      "reward": 0.011835969984531403,
      "reward_std": 0.015340530313551426,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0027344110421836376,
      "rewards/logprob_reward/std": 0.0038544186390936375,
      "step": 729
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 715.625,
      "completions/mean_terminated_length": 683.72412109375,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 2.253086419753086,
      "grad_norm": 1.7669001918411822,
      "kl": NaN,
      "learning_rate": 3.023527694393064e-07,
      "loss": -0.0642,
      "num_tokens": 20469546.0,
      "reward": 0.01330122072249651,
      "reward_std": 0.020615288987755775,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0008902450208552182,
      "rewards/logprob_reward/std": 0.0015324733685702085,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1019.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 703.84375,
      "completions/mean_terminated_length": 703.84375,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 2.256172839506173,
      "grad_norm": 2.1691027905437767,
      "kl": 0.1790771484375,
      "learning_rate": 3.0186381462493704e-07,
      "loss": -0.171,
      "num_tokens": 20499109.0,
      "reward": 0.011748358607292175,
      "reward_std": 0.020492851734161377,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0026370638515800238,
      "rewards/logprob_reward/std": 0.004131729248911142,
      "step": 731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 640.1875,
      "completions/mean_terminated_length": 627.8064575195312,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 2.259259259259259,
      "grad_norm": 1.919132832371422,
      "kl": 0.20391845703125,
      "learning_rate": 3.0137465240916614e-07,
      "loss": -0.0194,
      "num_tokens": 20525571.0,
      "reward": 0.014449520036578178,
      "reward_std": 0.026038456708192825,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.002166133839637041,
      "rewards/logprob_reward/std": 0.0022073055151849985,
      "step": 732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 711.0625,
      "completions/mean_terminated_length": 690.2000122070312,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 2.2623456790123457,
      "grad_norm": 2.1348215835442,
      "kl": 0.2127685546875,
      "learning_rate": 3.008852847481346e-07,
      "loss": -0.0467,
      "num_tokens": 20555449.0,
      "reward": 0.0079883998259902,
      "reward_std": 0.014689110219478607,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0019315548706799746,
      "rewards/logprob_reward/std": 0.0034405828919261694,
      "step": 733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 725.03125,
      "completions/mean_terminated_length": 682.3214721679688,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 2.265432098765432,
      "grad_norm": 2.2587945330038504,
      "kl": 0.1964111328125,
      "learning_rate": 3.003957135988049e-07,
      "loss": -0.0865,
      "num_tokens": 20585418.0,
      "reward": 0.007533889263868332,
      "reward_std": 0.01407144870609045,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0014265429927036166,
      "rewards/logprob_reward/std": 0.002197315450757742,
      "step": 734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 828.0,
      "completions/mean_length": 680.96875,
      "completions/mean_terminated_length": 631.9642944335938,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 2.2685185185185186,
      "grad_norm": 2.180209869824383,
      "kl": 0.20733642578125,
      "learning_rate": 2.999059409189533e-07,
      "loss": -0.1038,
      "num_tokens": 20613861.0,
      "reward": 0.011373293586075306,
      "reward_std": 0.020540405064821243,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.002220326103270054,
      "rewards/logprob_reward/std": 0.00289405370131135,
      "step": 735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 697.21875,
      "completions/mean_terminated_length": 663.413818359375,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 2.271604938271605,
      "grad_norm": 1.948421055598178,
      "kl": 0.203125,
      "learning_rate": 2.9941596866716174e-07,
      "loss": -0.0492,
      "num_tokens": 20642348.0,
      "reward": 0.014036007225513458,
      "reward_std": 0.02121649496257305,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.001706675160676241,
      "rewards/logprob_reward/std": 0.002709642518311739,
      "step": 736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 634.84375,
      "completions/mean_terminated_length": 622.290283203125,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 2.2746913580246915,
      "grad_norm": 2.3766095069961355,
      "kl": 0.2034912109375,
      "learning_rate": 2.989257988028105e-07,
      "loss": -0.0669,
      "num_tokens": 20669211.0,
      "reward": 0.01749192178249359,
      "reward_std": 0.02235991507768631,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0020743580535054207,
      "rewards/logprob_reward/std": 0.0026480804663151503,
      "step": 737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 696.71875,
      "completions/mean_terminated_length": 674.9000244140625,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 2.2777777777777777,
      "grad_norm": 2.037501844544708,
      "kl": 0.1934814453125,
      "learning_rate": 2.984354332860702e-07,
      "loss": -0.0959,
      "num_tokens": 20697986.0,
      "reward": 0.01077186968177557,
      "reward_std": 0.015292023308575153,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.001552077243104577,
      "rewards/logprob_reward/std": 0.002849878277629614,
      "step": 738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 738.875,
      "completions/mean_terminated_length": 673.0769653320312,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 2.2808641975308643,
      "grad_norm": 1.8398893931923475,
      "kl": 0.2205810546875,
      "learning_rate": 2.979448740778935e-07,
      "loss": 0.0243,
      "num_tokens": 20728650.0,
      "reward": 0.014752760529518127,
      "reward_std": 0.02185913547873497,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0025030672550201416,
      "rewards/logprob_reward/std": 0.003537602722644806,
      "step": 739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1008.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 632.5625,
      "completions/mean_terminated_length": 632.5625,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 2.2839506172839505,
      "grad_norm": 2.339917624735448,
      "kl": 0.2362060546875,
      "learning_rate": 2.9745412314000786e-07,
      "loss": -0.097,
      "num_tokens": 20755072.0,
      "reward": 0.022999752312898636,
      "reward_std": 0.033915817737579346,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0012497229035943747,
      "rewards/logprob_reward/std": 0.001911089289933443,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 670.40625,
      "completions/mean_terminated_length": 659.0,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 2.287037037037037,
      "grad_norm": 1.9800143544819198,
      "kl": 0.19384765625,
      "learning_rate": 2.9696318243490746e-07,
      "loss": -0.1877,
      "num_tokens": 20782865.0,
      "reward": 0.004813074134290218,
      "reward_std": 0.008067871443927288,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.001875637797638774,
      "rewards/logprob_reward/std": 0.0025465013459324837,
      "step": 741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 701.1875,
      "completions/mean_terminated_length": 679.6666870117188,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 2.2901234567901234,
      "grad_norm": 2.1524480397410843,
      "kl": 0.2032470703125,
      "learning_rate": 2.9647205392584533e-07,
      "loss": -0.1965,
      "num_tokens": 20812199.0,
      "reward": 0.006285310722887516,
      "reward_std": 0.009405326098203659,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.00351145607419312,
      "rewards/logprob_reward/std": 0.0050127143040299416,
      "step": 742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 818.0,
      "completions/mean_length": 636.5,
      "completions/mean_terminated_length": 610.6666870117188,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 2.29320987654321,
      "grad_norm": 2.1391421655950684,
      "kl": 0.2254638671875,
      "learning_rate": 2.959807395768255e-07,
      "loss": -0.1155,
      "num_tokens": 20839011.0,
      "reward": 0.010436173528432846,
      "reward_std": 0.01438200380653143,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0011790813878178596,
      "rewards/logprob_reward/std": 0.0016188162844628096,
      "step": 743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 732.25,
      "completions/mean_terminated_length": 690.5714721679688,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 2.2962962962962963,
      "grad_norm": 1.6584251680821471,
      "kl": 0.184326171875,
      "learning_rate": 2.95489241352595e-07,
      "loss": -0.0456,
      "num_tokens": 20869183.0,
      "reward": 0.011500661261379719,
      "reward_std": 0.016072984784841537,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0023618454579263926,
      "rewards/logprob_reward/std": 0.0040468983352184296,
      "step": 744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 865.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 625.3125,
      "completions/mean_terminated_length": 625.3125,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 2.299382716049383,
      "grad_norm": 2.3394841133481075,
      "kl": 0.2041015625,
      "learning_rate": 2.949975612186366e-07,
      "loss": -0.1045,
      "num_tokens": 20895609.0,
      "reward": 0.027013035491108894,
      "reward_std": 0.03591803461313248,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0022367057390511036,
      "rewards/logprob_reward/std": 0.0031810272485017776,
      "step": 745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 652.53125,
      "completions/mean_terminated_length": 614.1034545898438,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 2.302469135802469,
      "grad_norm": 2.0041419047184204,
      "kl": 0.2049560546875,
      "learning_rate": 2.9450570114116014e-07,
      "loss": -0.1177,
      "num_tokens": 20922798.0,
      "reward": 0.027759265154600143,
      "reward_std": 0.03280960023403168,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0030658484902232885,
      "rewards/logprob_reward/std": 0.0035908985882997513,
      "step": 746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1004.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 656.59375,
      "completions/mean_terminated_length": 656.59375,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 2.3055555555555554,
      "grad_norm": 2.0626102908524633,
      "kl": 0.225830078125,
      "learning_rate": 2.9401366308709513e-07,
      "loss": -0.0954,
      "num_tokens": 20950261.0,
      "reward": 0.02691406011581421,
      "reward_std": 0.03383072093129158,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.002126733073964715,
      "rewards/logprob_reward/std": 0.0029292285908013582,
      "step": 747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 694.34375,
      "completions/mean_terminated_length": 683.7096557617188,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 2.308641975308642,
      "grad_norm": 1.7855321890272653,
      "kl": 0.1859130859375,
      "learning_rate": 2.9352144902408296e-07,
      "loss": -0.0889,
      "num_tokens": 20979168.0,
      "reward": 0.020497024059295654,
      "reward_std": 0.03210761398077011,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001941136666573584,
      "rewards/logprob_reward/std": 0.0027287257835268974,
      "step": 748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 962.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 694.40625,
      "completions/mean_terminated_length": 694.40625,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 2.3117283950617282,
      "grad_norm": 2.0004801072760086,
      "kl": 0.173095703125,
      "learning_rate": 2.930290609204686e-07,
      "loss": -0.1498,
      "num_tokens": 21008037.0,
      "reward": 0.00931592471897602,
      "reward_std": 0.009273052215576172,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.003406583098694682,
      "rewards/logprob_reward/std": 0.0034958128817379475,
      "step": 749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 654.53125,
      "completions/mean_terminated_length": 642.6128540039062,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 2.314814814814815,
      "grad_norm": 1.7895581006013852,
      "kl": 0.19036865234375,
      "learning_rate": 2.925365007452933e-07,
      "loss": -0.0906,
      "num_tokens": 21035402.0,
      "reward": 0.01693890616297722,
      "reward_std": 0.02178483083844185,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0014598959824070334,
      "rewards/logprob_reward/std": 0.0020789767149835825,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 785.0,
      "completions/mean_length": 612.4375,
      "completions/mean_terminated_length": 585.0,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 2.317901234567901,
      "grad_norm": 2.382818710179892,
      "kl": 0.2159423828125,
      "learning_rate": 2.920437704682861e-07,
      "loss": -0.0949,
      "num_tokens": 21061172.0,
      "reward": 0.026447393000125885,
      "reward_std": 0.040804147720336914,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0016082143411040306,
      "rewards/logprob_reward/std": 0.002333967015147209,
      "step": 751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 841.0,
      "completions/mean_length": 636.53125,
      "completions/mean_terminated_length": 624.0322265625,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 2.3209876543209877,
      "grad_norm": 1.8283921741212457,
      "kl": 0.1959228515625,
      "learning_rate": 2.915508720598566e-07,
      "loss": -0.0756,
      "num_tokens": 21087901.0,
      "reward": 0.012147173285484314,
      "reward_std": 0.014731649309396744,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0030801924876868725,
      "rewards/logprob_reward/std": 0.00503428652882576,
      "step": 752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 651.6875,
      "completions/mean_terminated_length": 626.86669921875,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 2.324074074074074,
      "grad_norm": 2.3386370058074952,
      "kl": 0.2027587890625,
      "learning_rate": 2.910578074910865e-07,
      "loss": -0.1045,
      "num_tokens": 21115203.0,
      "reward": 0.014293940737843513,
      "reward_std": 0.026815392076969147,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.001993267796933651,
      "rewards/logprob_reward/std": 0.003454646561294794,
      "step": 753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 837.0,
      "completions/max_terminated_length": 837.0,
      "completions/mean_length": 606.4375,
      "completions/mean_terminated_length": 606.4375,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 2.3271604938271606,
      "grad_norm": 2.5045503605738797,
      "kl": 0.212890625,
      "learning_rate": 2.9056457873372213e-07,
      "loss": -0.2066,
      "num_tokens": 21140657.0,
      "reward": 0.013928234577178955,
      "reward_std": 0.025971507653594017,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.001586928148753941,
      "rewards/logprob_reward/std": 0.002250204561278224,
      "step": 754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1010.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 674.125,
      "completions/mean_terminated_length": 674.125,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.330246913580247,
      "grad_norm": 1.8750311313989472,
      "kl": 0.203125,
      "learning_rate": 2.9007118776016635e-07,
      "loss": 0.0243,
      "num_tokens": 21168265.0,
      "reward": 0.016981391236186028,
      "reward_std": 0.02731180191040039,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0015071008820086718,
      "rewards/logprob_reward/std": 0.0024619167670607567,
      "step": 755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 688.46875,
      "completions/mean_terminated_length": 666.1000366210938,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 2.3333333333333335,
      "grad_norm": 2.0285058126328632,
      "kl": 0.17169189453125,
      "learning_rate": 2.895776365434706e-07,
      "loss": -0.101,
      "num_tokens": 21196824.0,
      "reward": 0.002055669669061899,
      "reward_std": 0.002533155959099531,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00228407746180892,
      "rewards/logprob_reward/std": 0.003654703265056014,
      "step": 756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 625.5,
      "completions/mean_terminated_length": 612.6451416015625,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 2.3364197530864197,
      "grad_norm": 3.9096325385449853,
      "kl": 0.2318115234375,
      "learning_rate": 2.8908392705732724e-07,
      "loss": -0.2157,
      "num_tokens": 21222676.0,
      "reward": 0.014098500832915306,
      "reward_std": 0.016845598816871643,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0017761120107024908,
      "rewards/logprob_reward/std": 0.00431130500510335,
      "step": 757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 693.09375,
      "completions/mean_terminated_length": 671.0333862304688,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.3395061728395063,
      "grad_norm": 2.0621807247628516,
      "kl": 0.19091796875,
      "learning_rate": 2.885900612760616e-07,
      "loss": -0.0017,
      "num_tokens": 21250915.0,
      "reward": 0.013744791969656944,
      "reward_std": 0.025930313393473625,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0013831022661179304,
      "rewards/logprob_reward/std": 0.0021195383742451668,
      "step": 758
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 723.59375,
      "completions/mean_terminated_length": 654.2692260742188,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 2.3425925925925926,
      "grad_norm": 2.130005701801698,
      "kl": NaN,
      "learning_rate": 2.8809604117462397e-07,
      "loss": -0.2003,
      "num_tokens": 21280958.0,
      "reward": 0.01306244358420372,
      "reward_std": 0.021564170718193054,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.004097159951925278,
      "rewards/logprob_reward/std": 0.0075044953264296055,
      "step": 759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 624.96875,
      "completions/mean_terminated_length": 612.0967407226562,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 2.3456790123456788,
      "grad_norm": 2.2385336775800586,
      "kl": 0.22021484375,
      "learning_rate": 2.876018687285817e-07,
      "loss": -0.0829,
      "num_tokens": 21307321.0,
      "reward": 0.02025846391916275,
      "reward_std": 0.033152103424072266,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0016760698053985834,
      "rewards/logprob_reward/std": 0.0024185858201235533,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 921.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 668.625,
      "completions/mean_terminated_length": 668.625,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 2.3487654320987654,
      "grad_norm": 2.222865996326742,
      "kl": 0.1905517578125,
      "learning_rate": 2.8710754591411147e-07,
      "loss": -0.1101,
      "num_tokens": 21335093.0,
      "reward": 0.020237958058714867,
      "reward_std": 0.02622999995946884,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0016532877925783396,
      "rewards/logprob_reward/std": 0.0030789622105658054,
      "step": 761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1018.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 606.3125,
      "completions/mean_terminated_length": 606.3125,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 2.351851851851852,
      "grad_norm": 2.6081438099890977,
      "kl": 0.21533203125,
      "learning_rate": 2.8661307470799114e-07,
      "loss": -0.2491,
      "num_tokens": 21360947.0,
      "reward": 0.039200570434331894,
      "reward_std": 0.05209661275148392,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0018895207904279232,
      "rewards/logprob_reward/std": 0.003134438069537282,
      "step": 762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 683.09375,
      "completions/mean_terminated_length": 647.8275756835938,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 2.3549382716049383,
      "grad_norm": 1.9614533144375295,
      "kl": 0.19000244140625,
      "learning_rate": 2.861184570875921e-07,
      "loss": -0.0871,
      "num_tokens": 21389146.0,
      "reward": 0.014616047963500023,
      "reward_std": 0.026574091985821724,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0023511643521487713,
      "rewards/logprob_reward/std": 0.004739148076623678,
      "step": 763
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 932.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 627.0625,
      "completions/mean_terminated_length": 627.0625,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.3580246913580245,
      "grad_norm": 1.7626662455342699,
      "kl": NaN,
      "learning_rate": 2.856236950308711e-07,
      "loss": -0.1747,
      "num_tokens": 21415380.0,
      "reward": 0.011208342388272285,
      "reward_std": 0.01498295646160841,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0020370474085211754,
      "rewards/logprob_reward/std": 0.0030780963134020567,
      "step": 764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 684.09375,
      "completions/mean_terminated_length": 661.433349609375,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 2.361111111111111,
      "grad_norm": 1.9784413179253255,
      "kl": 0.1986083984375,
      "learning_rate": 2.851287905163628e-07,
      "loss": -0.0547,
      "num_tokens": 21443735.0,
      "reward": 0.029958879575133324,
      "reward_std": 0.039155617356300354,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0020376425236463547,
      "rewards/logprob_reward/std": 0.0034716313239187002,
      "step": 765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 635.15625,
      "completions/mean_terminated_length": 622.6129150390625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 2.3641975308641974,
      "grad_norm": 3.031221502195575,
      "kl": 0.220947265625,
      "learning_rate": 2.8463374552317123e-07,
      "loss": -0.3255,
      "num_tokens": 21470648.0,
      "reward": 0.012188144959509373,
      "reward_std": 0.016201891005039215,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0031257164664566517,
      "rewards/logprob_reward/std": 0.0069906204007565975,
      "step": 766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 681.71875,
      "completions/mean_terminated_length": 646.3103637695312,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 2.367283950617284,
      "grad_norm": 2.1293540852434014,
      "kl": 0.1748046875,
      "learning_rate": 2.8413856203096226e-07,
      "loss": -0.1594,
      "num_tokens": 21499035.0,
      "reward": 0.001801763428375125,
      "reward_std": 0.001622794196009636,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.00200195936486125,
      "rewards/logprob_reward/std": 0.003044778248295188,
      "step": 767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 659.0625,
      "completions/mean_terminated_length": 634.7333374023438,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 2.3703703703703702,
      "grad_norm": 1.7041509254321938,
      "kl": 0.19854736328125,
      "learning_rate": 2.836432420199557e-07,
      "loss": 0.0039,
      "num_tokens": 21526845.0,
      "reward": 0.024135377258062363,
      "reward_std": 0.03447246551513672,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.002511532511562109,
      "rewards/logprob_reward/std": 0.004762453492730856,
      "step": 768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 690.5,
      "completions/mean_terminated_length": 668.2667236328125,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 2.373456790123457,
      "grad_norm": 1.9538570419641474,
      "kl": 0.189208984375,
      "learning_rate": 2.831477874709172e-07,
      "loss": -0.0272,
      "num_tokens": 21555429.0,
      "reward": 0.020408859476447105,
      "reward_std": 0.03313077613711357,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0018431773642078042,
      "rewards/logprob_reward/std": 0.0025917550083249807,
      "step": 769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 919.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 633.5,
      "completions/mean_terminated_length": 633.5,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 2.376543209876543,
      "grad_norm": 2.1641262080253942,
      "kl": 0.2119140625,
      "learning_rate": 2.826522003651504e-07,
      "loss": -0.0668,
      "num_tokens": 21581841.0,
      "reward": 0.01095607504248619,
      "reward_std": 0.02049342915415764,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0017567494651302695,
      "rewards/logprob_reward/std": 0.002133831148967147,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 857.0,
      "completions/mean_length": 677.28125,
      "completions/mean_terminated_length": 654.1666870117188,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 2.3796296296296298,
      "grad_norm": 1.7747904490039623,
      "kl": 0.17822265625,
      "learning_rate": 2.8215648268448926e-07,
      "loss": -0.0883,
      "num_tokens": 21610214.0,
      "reward": 0.006160065997391939,
      "reward_std": 0.008916087448596954,
      "rewards/format_reward_func/mean": 0.03125,
      "rewards/format_reward_func/std": 0.1767766922712326,
      "rewards/logprob_reward/mean": 0.0033722955267876387,
      "rewards/logprob_reward/std": 0.003799574449658394,
      "step": 771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 918.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 601.59375,
      "completions/mean_terminated_length": 601.59375,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 2.382716049382716,
      "grad_norm": 3.416045127994316,
      "kl": 0.19635009765625,
      "learning_rate": 2.8166063641128963e-07,
      "loss": -0.3115,
      "num_tokens": 21635393.0,
      "reward": 0.016752969473600388,
      "reward_std": 0.02682165801525116,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0012532987166196108,
      "rewards/logprob_reward/std": 0.0017938826931640506,
      "step": 772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 776.0,
      "completions/mean_length": 642.25,
      "completions/mean_terminated_length": 587.7142944335938,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 2.3858024691358026,
      "grad_norm": 2.6297709005463186,
      "kl": 0.2100830078125,
      "learning_rate": 2.8116466352842165e-07,
      "loss": -0.139,
      "num_tokens": 21662481.0,
      "reward": 0.02676667645573616,
      "reward_std": 0.034649819135665894,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.001962975598871708,
      "rewards/logprob_reward/std": 0.0030351963359862566,
      "step": 773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 674.875,
      "completions/mean_terminated_length": 625.0,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 2.388888888888889,
      "grad_norm": 2.7441401584018723,
      "kl": 0.208984375,
      "learning_rate": 2.80668566019262e-07,
      "loss": -0.1109,
      "num_tokens": 21690513.0,
      "reward": 0.020455727353692055,
      "reward_std": 0.02898206189274788,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0018952535465359688,
      "rewards/logprob_reward/std": 0.003361431183293462,
      "step": 774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 640.6875,
      "completions/mean_terminated_length": 628.3225708007812,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 2.3919753086419755,
      "grad_norm": 2.423581119859638,
      "kl": 0.19287109375,
      "learning_rate": 2.8017234586768534e-07,
      "loss": -0.2178,
      "num_tokens": 21717463.0,
      "reward": 0.018702922388911247,
      "reward_std": 0.022699924185872078,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0034199138171970844,
      "rewards/logprob_reward/std": 0.0043744854629039764,
      "step": 775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 797.0,
      "completions/mean_length": 683.21875,
      "completions/mean_terminated_length": 604.5769653320312,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 2.3950617283950617,
      "grad_norm": 2.218899276677366,
      "kl": 0.21234130859375,
      "learning_rate": 2.796760050580571e-07,
      "loss": -0.1398,
      "num_tokens": 21745822.0,
      "reward": 0.0223326925188303,
      "reward_std": 0.02641984634101391,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0005085471784695983,
      "rewards/logprob_reward/std": 0.0008823273237794638,
      "step": 776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 967.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 633.59375,
      "completions/mean_terminated_length": 633.59375,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 2.398148148148148,
      "grad_norm": 2.3842149240513777,
      "kl": 0.179443359375,
      "learning_rate": 2.7917954557522503e-07,
      "loss": -0.2196,
      "num_tokens": 21772169.0,
      "reward": 0.03312259167432785,
      "reward_std": 0.033782489597797394,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002080656588077545,
      "rewards/logprob_reward/std": 0.0032138347160071135,
      "step": 777
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 714.0,
      "completions/mean_terminated_length": 642.4615478515625,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 2.4012345679012346,
      "grad_norm": 2.0869436884326116,
      "kl": NaN,
      "learning_rate": 2.786829694045116e-07,
      "loss": -0.0507,
      "num_tokens": 21802057.0,
      "reward": 0.01728595420718193,
      "reward_std": 0.02657473087310791,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0018455050885677338,
      "rewards/logprob_reward/std": 0.0020782107021659613,
      "step": 778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 817.0,
      "completions/max_terminated_length": 817.0,
      "completions/mean_length": 561.875,
      "completions/mean_terminated_length": 561.875,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 2.4043209876543212,
      "grad_norm": 2.100845147856409,
      "kl": 0.200927734375,
      "learning_rate": 2.7818627853170585e-07,
      "loss": -0.0829,
      "num_tokens": 21826125.0,
      "reward": 0.0206417478621006,
      "reward_std": 0.027341028675436974,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.002101942664012313,
      "rewards/logprob_reward/std": 0.0026918782386928797,
      "step": 779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 691.03125,
      "completions/mean_terminated_length": 643.4642944335938,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 2.4074074074074074,
      "grad_norm": 2.1858736493189777,
      "kl": 0.20977783203125,
      "learning_rate": 2.7768947494305545e-07,
      "loss": -0.1138,
      "num_tokens": 21854738.0,
      "reward": 0.017663007602095604,
      "reward_std": 0.026448804885149002,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.002264453563839197,
      "rewards/logprob_reward/std": 0.0034026289358735085,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 665.71875,
      "completions/mean_terminated_length": 614.5357666015625,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 2.4104938271604937,
      "grad_norm": 2.126362146413349,
      "kl": 0.21826171875,
      "learning_rate": 2.7719256062525884e-07,
      "loss": -0.0783,
      "num_tokens": 21882865.0,
      "reward": 0.023511648178100586,
      "reward_std": 0.020995650440454483,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.005290718283504248,
      "rewards/logprob_reward/std": 0.0067299045622348785,
      "step": 781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1010.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 568.03125,
      "completions/mean_terminated_length": 568.03125,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 2.4135802469135803,
      "grad_norm": 2.45437341508095,
      "kl": 0.2205810546875,
      "learning_rate": 2.766955375654573e-07,
      "loss": -0.2238,
      "num_tokens": 21907334.0,
      "reward": 0.018289368599653244,
      "reward_std": 0.02329649031162262,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.002960409503430128,
      "rewards/logprob_reward/std": 0.005423862487077713,
      "step": 782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 619.1875,
      "completions/mean_terminated_length": 606.1290283203125,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 2.4166666666666665,
      "grad_norm": 2.077958899749018,
      "kl": 0.203369140625,
      "learning_rate": 2.7619840775122695e-07,
      "loss": -0.1114,
      "num_tokens": 21933496.0,
      "reward": 0.011950278654694557,
      "reward_std": 0.02096518874168396,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.002861420623958111,
      "rewards/logprob_reward/std": 0.0045641642063856125,
      "step": 783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 907.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 622.90625,
      "completions/mean_terminated_length": 622.90625,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 2.419753086419753,
      "grad_norm": 1.9110171745488564,
      "kl": 0.20361328125,
      "learning_rate": 2.7570117317057087e-07,
      "loss": -0.14,
      "num_tokens": 21960053.0,
      "reward": 0.018233656883239746,
      "reward_std": 0.02679438516497612,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0028985063545405865,
      "rewards/logprob_reward/std": 0.0035074797924607992,
      "step": 784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 990.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 641.3125,
      "completions/mean_terminated_length": 641.3125,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 2.4228395061728394,
      "grad_norm": 1.955324665689567,
      "kl": 0.2054443359375,
      "learning_rate": 2.7520383581191085e-07,
      "loss": -0.2198,
      "num_tokens": 21987327.0,
      "reward": 0.01924588903784752,
      "reward_std": 0.026994815096259117,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.00402320921421051,
      "rewards/logprob_reward/std": 0.005341153126209974,
      "step": 785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 641.53125,
      "completions/mean_terminated_length": 601.9655151367188,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 2.425925925925926,
      "grad_norm": 1.967916964949879,
      "kl": 0.2139892578125,
      "learning_rate": 2.7470639766408003e-07,
      "loss": -0.0426,
      "num_tokens": 22014296.0,
      "reward": 0.017452334985136986,
      "reward_std": 0.02039685659110546,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0020303723867982626,
      "rewards/logprob_reward/std": 0.004384672734886408,
      "step": 786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 663.3125,
      "completions/mean_terminated_length": 651.6774291992188,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 2.4290123456790123,
      "grad_norm": 2.049183593122295,
      "kl": 0.2091064453125,
      "learning_rate": 2.7420886071631455e-07,
      "loss": -0.1806,
      "num_tokens": 22041906.0,
      "reward": 0.017709145322442055,
      "reward_std": 0.02079470083117485,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0023157179821282625,
      "rewards/logprob_reward/std": 0.0030351937748491764,
      "step": 787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 653.71875,
      "completions/mean_terminated_length": 641.774169921875,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 2.432098765432099,
      "grad_norm": 2.005760721591516,
      "kl": 0.239501953125,
      "learning_rate": 2.7371122695824534e-07,
      "loss": -0.0891,
      "num_tokens": 22069113.0,
      "reward": 0.015844479203224182,
      "reward_std": 0.021281344816088676,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0037160879001021385,
      "rewards/logprob_reward/std": 0.004673448856920004,
      "step": 788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 686.6875,
      "completions/mean_terminated_length": 675.8064575195312,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 2.435185185185185,
      "grad_norm": 2.271091804404587,
      "kl": 0.1875,
      "learning_rate": 2.732134983798907e-07,
      "loss": -0.1317,
      "num_tokens": 22098127.0,
      "reward": 0.026305291801691055,
      "reward_std": 0.03931676968932152,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.004922546446323395,
      "rewards/logprob_reward/std": 0.0043291207402944565,
      "step": 789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 844.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 632.46875,
      "completions/mean_terminated_length": 632.46875,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.4382716049382718,
      "grad_norm": 2.1275371201597326,
      "kl": 0.19384765625,
      "learning_rate": 2.727156769716482e-07,
      "loss": -0.0638,
      "num_tokens": 22124702.0,
      "reward": 0.012196826748549938,
      "reward_std": 0.01979811303317547,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0031353633385151625,
      "rewards/logprob_reward/std": 0.003473072312772274,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 624.46875,
      "completions/mean_terminated_length": 597.8333740234375,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.441358024691358,
      "grad_norm": 2.1957550340046064,
      "kl": 0.231201171875,
      "learning_rate": 2.722177647242863e-07,
      "loss": -0.0802,
      "num_tokens": 22150957.0,
      "reward": 0.020226188004016876,
      "reward_std": 0.03338390588760376,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0016402084147557616,
      "rewards/logprob_reward/std": 0.0022355131804943085,
      "step": 791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 734.9375,
      "completions/mean_terminated_length": 654.0,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 2.4444444444444446,
      "grad_norm": 1.6726344757151257,
      "kl": 0.20458984375,
      "learning_rate": 2.717197636289373e-07,
      "loss": -0.0819,
      "num_tokens": 22181163.0,
      "reward": 0.01065262220799923,
      "reward_std": 0.014610957354307175,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0014195798430591822,
      "rewards/logprob_reward/std": 0.0018534038681536913,
      "step": 792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 821.0,
      "completions/max_terminated_length": 821.0,
      "completions/mean_length": 580.6875,
      "completions/mean_terminated_length": 580.6875,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 2.447530864197531,
      "grad_norm": 2.398142707509085,
      "kl": 0.2135009765625,
      "learning_rate": 2.712216756770881e-07,
      "loss": -0.1378,
      "num_tokens": 22205769.0,
      "reward": 0.02064337022602558,
      "reward_std": 0.025822220370173454,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0021037464030086994,
      "rewards/logprob_reward/std": 0.0022530886344611645,
      "step": 793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 968.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 648.09375,
      "completions/mean_terminated_length": 648.09375,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 2.450617283950617,
      "grad_norm": 3.0934505561772836,
      "kl": 0.2025146484375,
      "learning_rate": 2.7072350286057354e-07,
      "loss": -0.1571,
      "num_tokens": 22232980.0,
      "reward": 0.025620292872190475,
      "reward_std": 0.02903948351740837,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.00416143424808979,
      "rewards/logprob_reward/std": 0.00495970668271184,
      "step": 794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 701.15625,
      "completions/mean_terminated_length": 679.6333618164062,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 2.4537037037037037,
      "grad_norm": 2.1450314466149716,
      "kl": 0.1929931640625,
      "learning_rate": 2.7022524717156734e-07,
      "loss": -0.0682,
      "num_tokens": 22261977.0,
      "reward": 0.02857239916920662,
      "reward_std": 0.03375323861837387,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.003969332203269005,
      "rewards/logprob_reward/std": 0.004325473215430975,
      "step": 795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 683.375,
      "completions/mean_terminated_length": 672.3870849609375,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 2.45679012345679,
      "grad_norm": 2.0853127274576515,
      "kl": 0.191162109375,
      "learning_rate": 2.6972691060257504e-07,
      "loss": -0.0678,
      "num_tokens": 22290609.0,
      "reward": 0.018175773322582245,
      "reward_std": 0.027988426387310028,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0028341934084892273,
      "rewards/logprob_reward/std": 0.005148232914507389,
      "step": 796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 683.8125,
      "completions/mean_terminated_length": 661.1333618164062,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 2.4598765432098766,
      "grad_norm": 2.270771040940409,
      "kl": 0.221923828125,
      "learning_rate": 2.6922849514642524e-07,
      "loss": -0.1624,
      "num_tokens": 22319203.0,
      "reward": 0.028170034289360046,
      "reward_std": 0.035853706300258636,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.003522261045873165,
      "rewards/logprob_reward/std": 0.004629835020750761,
      "step": 797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 701.125,
      "completions/mean_terminated_length": 690.7096557617188,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 2.462962962962963,
      "grad_norm": 2.3139829790799915,
      "kl": 0.19635009765625,
      "learning_rate": 2.687300027962624e-07,
      "loss": -0.0852,
      "num_tokens": 22348239.0,
      "reward": 0.008503954857587814,
      "reward_std": 0.0144048435613513,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.002504394156858325,
      "rewards/logprob_reward/std": 0.002979145385324955,
      "step": 798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 663.0625,
      "completions/mean_terminated_length": 639.0000610351562,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.4660493827160495,
      "grad_norm": 1.9387656971456555,
      "kl": 0.1888427734375,
      "learning_rate": 2.682314355455381e-07,
      "loss": -0.0454,
      "num_tokens": 22375397.0,
      "reward": 0.03912090137600899,
      "reward_std": 0.04738355427980423,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0018010009080171585,
      "rewards/logprob_reward/std": 0.00249595008790493,
      "step": 799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 854.0,
      "completions/mean_length": 647.1875,
      "completions/mean_terminated_length": 635.0322265625,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 2.4691358024691357,
      "grad_norm": 1.9088482438239143,
      "kl": 0.206787109375,
      "learning_rate": 2.677327953880038e-07,
      "loss": -0.1478,
      "num_tokens": 22402259.0,
      "reward": 0.03538230061531067,
      "reward_std": 0.03400729224085808,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0011192248202860355,
      "rewards/logprob_reward/std": 0.0020716898143291473,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 706.1875,
      "completions/mean_terminated_length": 660.7857666015625,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 2.4722222222222223,
      "grad_norm": 2.2538632437104162,
      "kl": 0.2005615234375,
      "learning_rate": 2.6723408431770214e-07,
      "loss": -0.0677,
      "num_tokens": 22431425.0,
      "reward": 0.017258968204259872,
      "reward_std": 0.03199339658021927,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0018155192956328392,
      "rewards/logprob_reward/std": 0.003149349009618163,
      "step": 801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 631.8125,
      "completions/mean_terminated_length": 619.1612548828125,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 2.4753086419753085,
      "grad_norm": 3.179859102788341,
      "kl": 0.2052001953125,
      "learning_rate": 2.6673530432895957e-07,
      "loss": -0.2108,
      "num_tokens": 22458075.0,
      "reward": 0.021727275103330612,
      "reward_std": 0.03393279388546944,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0033080829307436943,
      "rewards/logprob_reward/std": 0.004199848975986242,
      "step": 802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1010.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 626.46875,
      "completions/mean_terminated_length": 626.46875,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 2.478395061728395,
      "grad_norm": 2.048019564442794,
      "kl": 0.20166015625,
      "learning_rate": 2.6623645741637815e-07,
      "loss": -0.0589,
      "num_tokens": 22484558.0,
      "reward": 0.01799776963889599,
      "reward_std": 0.027030106633901596,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0026364100631326437,
      "rewards/logprob_reward/std": 0.0041060373187065125,
      "step": 803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 673.4375,
      "completions/mean_terminated_length": 662.1290283203125,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 2.4814814814814814,
      "grad_norm": 2.267452814467895,
      "kl": 0.21453857421875,
      "learning_rate": 2.6573754557482746e-07,
      "loss": -0.2337,
      "num_tokens": 22512584.0,
      "reward": 0.023072022944688797,
      "reward_std": 0.02745337039232254,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.001330025726929307,
      "rewards/logprob_reward/std": 0.003039197064936161,
      "step": 804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 659.25,
      "completions/mean_terminated_length": 647.4838256835938,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 2.484567901234568,
      "grad_norm": 2.2584024761330634,
      "kl": 0.1944580078125,
      "learning_rate": 2.652385707994369e-07,
      "loss": -0.1697,
      "num_tokens": 22540008.0,
      "reward": 0.019889328628778458,
      "reward_std": 0.028179999440908432,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0012659190688282251,
      "rewards/logprob_reward/std": 0.0020133298821747303,
      "step": 805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 674.96875,
      "completions/mean_terminated_length": 663.7096557617188,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 2.4876543209876543,
      "grad_norm": 4.047315712520667,
      "kl": 0.1915283203125,
      "learning_rate": 2.6473953508558726e-07,
      "loss": -0.2301,
      "num_tokens": 22567819.0,
      "reward": 0.020216289907693863,
      "reward_std": 0.026102395728230476,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0016292117070406675,
      "rewards/logprob_reward/std": 0.0024890329223126173,
      "step": 806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 849.0,
      "completions/mean_length": 629.0625,
      "completions/mean_terminated_length": 616.3225708007812,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.490740740740741,
      "grad_norm": 3.8122308876763054,
      "kl": 0.2374267578125,
      "learning_rate": 2.6424044042890334e-07,
      "loss": -0.2644,
      "num_tokens": 22594225.0,
      "reward": 0.04215528443455696,
      "reward_std": 0.04140203446149826,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0017003151588141918,
      "rewards/logprob_reward/std": 0.0036566979251801968,
      "step": 807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 886.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 634.53125,
      "completions/mean_terminated_length": 634.53125,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 2.493827160493827,
      "grad_norm": 2.045740451584124,
      "kl": 0.2281494140625,
      "learning_rate": 2.6374128882524527e-07,
      "loss": -0.118,
      "num_tokens": 22620978.0,
      "reward": 0.030714647844433784,
      "reward_std": 0.04138539358973503,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.002877388149499893,
      "rewards/logprob_reward/std": 0.003089474281296134,
      "step": 808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 716.625,
      "completions/mean_terminated_length": 672.7142944335938,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.496913580246914,
      "grad_norm": 3.7451071903248905,
      "kl": 0.20947265625,
      "learning_rate": 2.6324208227070136e-07,
      "loss": -0.5801,
      "num_tokens": 22650606.0,
      "reward": 0.010864447802305222,
      "reward_std": 0.020829197019338608,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0016549410065636039,
      "rewards/logprob_reward/std": 0.003017139621078968,
      "step": 809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 664.5,
      "completions/mean_terminated_length": 652.9031982421875,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 2.5,
      "grad_norm": 2.2161602841140726,
      "kl": 0.2120361328125,
      "learning_rate": 2.6274282276157934e-07,
      "loss": -0.1445,
      "num_tokens": 22678430.0,
      "reward": 0.04231654107570648,
      "reward_std": 0.034627120941877365,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0018794930074363947,
      "rewards/logprob_reward/std": 0.0027959332801401615,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 712.5625,
      "completions/mean_terminated_length": 691.800048828125,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 2.503086419753086,
      "grad_norm": 3.9039260299922915,
      "kl": 0.1927490234375,
      "learning_rate": 2.622435122943987e-07,
      "loss": -0.5333,
      "num_tokens": 22708060.0,
      "reward": 0.02357512339949608,
      "reward_std": 0.039907120168209076,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0018890247447416186,
      "rewards/logprob_reward/std": 0.0037410426884889603,
      "step": 811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 733.875,
      "completions/mean_terminated_length": 666.923095703125,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 2.506172839506173,
      "grad_norm": 7.871507714768179,
      "kl": 0.2376708984375,
      "learning_rate": 2.61744152865883e-07,
      "loss": -0.604,
      "num_tokens": 22738440.0,
      "reward": 0.01712499000132084,
      "reward_std": 0.02763485349714756,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0016666557639837265,
      "rewards/logprob_reward/std": 0.003386562690138817,
      "step": 812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 709.96875,
      "completions/mean_terminated_length": 622.0399780273438,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 2.5092592592592595,
      "grad_norm": 2.550898122770325,
      "kl": 0.2625732421875,
      "learning_rate": 2.6124474647295137e-07,
      "loss": -0.1239,
      "num_tokens": 22767875.0,
      "reward": 0.027525482699275017,
      "reward_std": 0.03265233337879181,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.002806092379614711,
      "rewards/logprob_reward/std": 0.00510747404769063,
      "step": 813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 688.28125,
      "completions/mean_terminated_length": 665.9000244140625,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 2.5123456790123457,
      "grad_norm": 2.9279018313591623,
      "kl": 0.2076416015625,
      "learning_rate": 2.607452951127107e-07,
      "loss": -0.3979,
      "num_tokens": 22796600.0,
      "reward": 0.017985306680202484,
      "reward_std": 0.022175882011651993,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0026225619949400425,
      "rewards/logprob_reward/std": 0.0036902620922774076,
      "step": 814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 793.40625,
      "completions/mean_terminated_length": 728.8399658203125,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 2.515432098765432,
      "grad_norm": 1.9669109131970839,
      "kl": 0.171630859375,
      "learning_rate": 2.6024580078244777e-07,
      "loss": -0.081,
      "num_tokens": 22828749.0,
      "reward": 0.021482855081558228,
      "reward_std": 0.029232092201709747,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0030365039128810167,
      "rewards/logprob_reward/std": 0.0033765204716473818,
      "step": 815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 715.15625,
      "completions/mean_terminated_length": 671.0357666015625,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 2.5185185185185186,
      "grad_norm": 2.721919174748601,
      "kl": 0.2049560546875,
      "learning_rate": 2.5974626547962127e-07,
      "loss": -0.2804,
      "num_tokens": 22859026.0,
      "reward": 0.008617338724434376,
      "reward_std": 0.00868584681302309,
      "rewards/format_reward_func/mean": 0.0625,
      "rewards/format_reward_func/std": 0.24593468010425568,
      "rewards/logprob_reward/mean": 0.0026303762570023537,
      "rewards/logprob_reward/std": 0.003620925359427929,
      "step": 816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 746.25,
      "completions/mean_terminated_length": 717.5172119140625,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 2.521604938271605,
      "grad_norm": 2.222073599850611,
      "kl": 0.2247314453125,
      "learning_rate": 2.5924669120185373e-07,
      "loss": -0.2009,
      "num_tokens": 22890034.0,
      "reward": 0.023714452981948853,
      "reward_std": 0.027255091816186905,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0020438386127352715,
      "rewards/logprob_reward/std": 0.003529217327013612,
      "step": 817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 977.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 661.71875,
      "completions/mean_terminated_length": 661.71875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 2.5246913580246915,
      "grad_norm": 2.088454467765236,
      "kl": 0.207275390625,
      "learning_rate": 2.5874707994692333e-07,
      "loss": -0.3222,
      "num_tokens": 22917609.0,
      "reward": 0.018701478838920593,
      "reward_std": 0.02833986096084118,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0034183121751993895,
      "rewards/logprob_reward/std": 0.0037787938490509987,
      "step": 818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 702.78125,
      "completions/mean_terminated_length": 692.4193115234375,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.5277777777777777,
      "grad_norm": 3.260481603818947,
      "kl": 0.2135009765625,
      "learning_rate": 2.582474337127564e-07,
      "loss": -0.2878,
      "num_tokens": 22946986.0,
      "reward": 0.027376752346754074,
      "reward_std": 0.028328103944659233,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.002640834078192711,
      "rewards/logprob_reward/std": 0.004320894833654165,
      "step": 819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 715.75,
      "completions/mean_terminated_length": 695.2000122070312,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 2.5308641975308643,
      "grad_norm": 1.851039622210421,
      "kl": 0.1866455078125,
      "learning_rate": 2.5774775449741903e-07,
      "loss": -0.1446,
      "num_tokens": 22976878.0,
      "reward": 0.02392280474305153,
      "reward_std": 0.03221181035041809,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0022753365337848663,
      "rewards/logprob_reward/std": 0.003072317922487855,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 907.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 658.9375,
      "completions/mean_terminated_length": 658.9375,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 2.5339506172839505,
      "grad_norm": 2.694360369586984,
      "kl": 0.2193603515625,
      "learning_rate": 2.572480442991092e-07,
      "loss": -0.2354,
      "num_tokens": 23004184.0,
      "reward": 0.030364010483026505,
      "reward_std": 0.04005251079797745,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0024877884425222874,
      "rewards/logprob_reward/std": 0.00370772578753531,
      "step": 821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 664.03125,
      "completions/mean_terminated_length": 640.0333862304688,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 2.537037037037037,
      "grad_norm": 2.2410612913781907,
      "kl": 0.215087890625,
      "learning_rate": 2.567483051161487e-07,
      "loss": -0.0733,
      "num_tokens": 23031981.0,
      "reward": 0.03331879526376724,
      "reward_std": 0.03510475903749466,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0022986605763435364,
      "rewards/logprob_reward/std": 0.002696947194635868,
      "step": 822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 738.875,
      "completions/mean_terminated_length": 709.3793334960938,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 2.5401234567901234,
      "grad_norm": 2.2639579831015446,
      "kl": 0.2003173828125,
      "learning_rate": 2.562485389469754e-07,
      "loss": -0.2896,
      "num_tokens": 23062461.0,
      "reward": 0.020506957545876503,
      "reward_std": 0.028530918061733246,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001952175865881145,
      "rewards/logprob_reward/std": 0.0033695560414344072,
      "step": 823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 720.125,
      "completions/mean_terminated_length": 710.3225708007812,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 2.5432098765432096,
      "grad_norm": 2.476754768832393,
      "kl": 0.216796875,
      "learning_rate": 2.5574874779013494e-07,
      "loss": -0.3659,
      "num_tokens": 23091893.0,
      "reward": 0.010837879031896591,
      "reward_std": 0.02046077884733677,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0016254207585006952,
      "rewards/logprob_reward/std": 0.0024387193843722343,
      "step": 824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 716.25,
      "completions/mean_terminated_length": 672.2857666015625,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 2.5462962962962963,
      "grad_norm": 2.8502985601277833,
      "kl": 0.20654296875,
      "learning_rate": 2.5524893364427307e-07,
      "loss": -0.3099,
      "num_tokens": 23121433.0,
      "reward": 0.03331666439771652,
      "reward_std": 0.03405371308326721,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002296295017004013,
      "rewards/logprob_reward/std": 0.0034785487223416567,
      "step": 825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 727.6875,
      "completions/mean_terminated_length": 718.1290283203125,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.549382716049383,
      "grad_norm": 1.9951048258107709,
      "kl": 0.180908203125,
      "learning_rate": 2.547490985081272e-07,
      "loss": -0.2309,
      "num_tokens": 23151375.0,
      "reward": 0.017598789185285568,
      "reward_std": 0.031999118626117706,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0021930988878011703,
      "rewards/logprob_reward/std": 0.0025571477599442005,
      "step": 826
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 696.34375,
      "completions/mean_terminated_length": 674.5000610351562,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 2.552469135802469,
      "grad_norm": 2.8159746377764665,
      "kl": NaN,
      "learning_rate": 2.5424924438051896e-07,
      "loss": -0.2667,
      "num_tokens": 23180022.0,
      "reward": 0.029780089855194092,
      "reward_std": 0.044473059475421906,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0018389882752671838,
      "rewards/logprob_reward/std": 0.0028411380480974913,
      "step": 827
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 747.75,
      "completions/mean_terminated_length": 719.1724243164062,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 2.5555555555555554,
      "grad_norm": 4.467702325213489,
      "kl": NaN,
      "learning_rate": 2.5374937326034575e-07,
      "loss": -0.774,
      "num_tokens": 23210766.0,
      "reward": 0.014227262698113918,
      "reward_std": 0.022986872121691704,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0019191803876310587,
      "rewards/logprob_reward/std": 0.005398834589868784,
      "step": 828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 669.15625,
      "completions/mean_terminated_length": 645.5000610351562,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 2.558641975308642,
      "grad_norm": 2.597076161518784,
      "kl": 0.2498779296875,
      "learning_rate": 2.5324948714657287e-07,
      "loss": -0.2013,
      "num_tokens": 23238795.0,
      "reward": 0.03445756062865257,
      "reward_std": 0.028805706650018692,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0035639547277241945,
      "rewards/logprob_reward/std": 0.0068948217667639256,
      "step": 829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 705.4375,
      "completions/mean_terminated_length": 672.4827270507812,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 2.5617283950617287,
      "grad_norm": 3.653506477903024,
      "kl": 0.21246337890625,
      "learning_rate": 2.527495880382259e-07,
      "loss": -0.4878,
      "num_tokens": 23268021.0,
      "reward": 0.01364973746240139,
      "reward_std": 0.026206474751234055,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0012774853967130184,
      "rewards/logprob_reward/std": 0.0029412677977234125,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 960.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 724.375,
      "completions/mean_terminated_length": 724.375,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 2.564814814814815,
      "grad_norm": 2.309741392394042,
      "kl": 0.2120361328125,
      "learning_rate": 2.522496779343819e-07,
      "loss": -0.1435,
      "num_tokens": 23297769.0,
      "reward": 0.03863968327641487,
      "reward_std": 0.05230659246444702,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012663148809224367,
      "rewards/logprob_reward/std": 0.002999991411343217,
      "step": 831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 722.1875,
      "completions/mean_terminated_length": 690.9655151367188,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 2.567901234567901,
      "grad_norm": 3.3358968568484353,
      "kl": 0.216064453125,
      "learning_rate": 2.5174975883416237e-07,
      "loss": -0.5637,
      "num_tokens": 23327459.0,
      "reward": 0.03565439581871033,
      "reward_std": 0.047280535101890564,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001421550172381103,
      "rewards/logprob_reward/std": 0.003314180066809058,
      "step": 832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 715.46875,
      "completions/mean_terminated_length": 694.9000244140625,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 2.5709876543209877,
      "grad_norm": 2.702658914359402,
      "kl": 0.212890625,
      "learning_rate": 2.512498327367245e-07,
      "loss": -0.4353,
      "num_tokens": 23356742.0,
      "reward": 0.03318271040916443,
      "reward_std": 0.04089856147766113,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002147453837096691,
      "rewards/logprob_reward/std": 0.003950577694922686,
      "step": 833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 719.15625,
      "completions/mean_terminated_length": 698.8333740234375,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 2.574074074074074,
      "grad_norm": 3.3366556553222115,
      "kl": 0.224365234375,
      "learning_rate": 2.5074990164125355e-07,
      "loss": -0.4936,
      "num_tokens": 23386711.0,
      "reward": 0.020330991595983505,
      "reward_std": 0.03460453450679779,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001756657613441348,
      "rewards/logprob_reward/std": 0.003462655935436487,
      "step": 834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 708.96875,
      "completions/mean_terminated_length": 676.3793334960938,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 2.5771604938271606,
      "grad_norm": 2.6519424685060122,
      "kl": 0.2386474609375,
      "learning_rate": 2.502499675469547e-07,
      "loss": -0.3995,
      "num_tokens": 23416294.0,
      "reward": 0.032521802932024,
      "reward_std": 0.04797051474452019,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0014131118077784777,
      "rewards/logprob_reward/std": 0.003906686324626207,
      "step": 835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 693.4375,
      "completions/mean_terminated_length": 682.774169921875,
      "completions/min_length": 533.0,
      "completions/min_terminated_length": 533.0,
      "epoch": 2.580246913580247,
      "grad_norm": 3.67101673480092,
      "kl": 0.23876953125,
      "learning_rate": 2.497500324530453e-07,
      "loss": -0.4025,
      "num_tokens": 23444636.0,
      "reward": 0.02910676598548889,
      "reward_std": 0.04685968905687332,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0010908524272963405,
      "rewards/logprob_reward/std": 0.002023884328082204,
      "step": 836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 668.28125,
      "completions/mean_terminated_length": 656.8064575195312,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 2.5833333333333335,
      "grad_norm": 3.253404379602226,
      "kl": 0.2213134765625,
      "learning_rate": 2.4925009835874643e-07,
      "loss": -0.3962,
      "num_tokens": 23472589.0,
      "reward": 0.02016962505877018,
      "reward_std": 0.02743956819176674,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001577360788360238,
      "rewards/logprob_reward/std": 0.002759169088676572,
      "step": 837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 671.65625,
      "completions/mean_terminated_length": 660.290283203125,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 2.5864197530864197,
      "grad_norm": 2.1602120687780175,
      "kl": 0.2156982421875,
      "learning_rate": 2.4875016726327555e-07,
      "loss": -0.1269,
      "num_tokens": 23500206.0,
      "reward": 0.026714034378528595,
      "reward_std": 0.039035614579916,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.001904481090605259,
      "rewards/logprob_reward/std": 0.00641883397474885,
      "step": 838
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 740.625,
      "completions/mean_terminated_length": 700.1428833007812,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 2.5895061728395063,
      "grad_norm": 2.3325872334653863,
      "kl": NaN,
      "learning_rate": 2.482502411658376e-07,
      "loss": -0.3661,
      "num_tokens": 23530850.0,
      "reward": 0.0106188440695405,
      "reward_std": 0.015147138386964798,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.001382049173116684,
      "rewards/logprob_reward/std": 0.002947068540379405,
      "step": 839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 782.4375,
      "completions/mean_terminated_length": 714.7999877929688,
      "completions/min_length": 498.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 2.5925925925925926,
      "grad_norm": 2.544350818063004,
      "kl": 0.2039794921875,
      "learning_rate": 2.477503220656181e-07,
      "loss": -0.3391,
      "num_tokens": 23562580.0,
      "reward": 0.010148586705327034,
      "reward_std": 0.019542181864380836,
      "rewards/format_reward_func/mean": 0.09375,
      "rewards/format_reward_func/std": 0.2961445748806,
      "rewards/logprob_reward/mean": 0.0008595405961386859,
      "rewards/logprob_reward/std": 0.0020047107245773077,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 718.0,
      "completions/mean_terminated_length": 674.2857666015625,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 2.5956790123456788,
      "grad_norm": 1.9610370735118756,
      "kl": 0.2322998046875,
      "learning_rate": 2.472504119617742e-07,
      "loss": -0.2352,
      "num_tokens": 23592020.0,
      "reward": 0.05466719716787338,
      "reward_std": 0.03430184721946716,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0017135508824139833,
      "rewards/logprob_reward/std": 0.00216072634793818,
      "step": 841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1022.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 674.125,
      "completions/mean_terminated_length": 674.125,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 2.5987654320987654,
      "grad_norm": 2.7523464152444657,
      "kl": 0.2215576171875,
      "learning_rate": 2.4675051285342716e-07,
      "loss": -0.4303,
      "num_tokens": 23619552.0,
      "reward": 0.026860298588871956,
      "reward_std": 0.03700836002826691,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.002066997578367591,
      "rewards/logprob_reward/std": 0.00514443451538682,
      "step": 842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 703.84375,
      "completions/mean_terminated_length": 693.51611328125,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 2.601851851851852,
      "grad_norm": 2.214071112523842,
      "kl": 0.224609375,
      "learning_rate": 2.462506267396543e-07,
      "loss": -0.2276,
      "num_tokens": 23648431.0,
      "reward": 0.025821613147854805,
      "reward_std": 0.039692893624305725,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0009129041573032737,
      "rewards/logprob_reward/std": 0.0021135048009455204,
      "step": 843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 718.78125,
      "completions/mean_terminated_length": 698.433349609375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.6049382716049383,
      "grad_norm": 2.786763201629874,
      "kl": 0.2379150390625,
      "learning_rate": 2.45750755619481e-07,
      "loss": -0.4413,
      "num_tokens": 23677764.0,
      "reward": 0.03020767867565155,
      "reward_std": 0.04247771203517914,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0023140886332839727,
      "rewards/logprob_reward/std": 0.005226579960435629,
      "step": 844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 813.84375,
      "completions/mean_terminated_length": 774.9259033203125,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 2.6080246913580245,
      "grad_norm": 1.7725783149245817,
      "kl": 0.221923828125,
      "learning_rate": 2.452509014918728e-07,
      "loss": -0.0991,
      "num_tokens": 23711003.0,
      "reward": 0.016469093039631844,
      "reward_std": 0.026728259399533272,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0009378820541314781,
      "rewards/logprob_reward/std": 0.0023578214459121227,
      "step": 845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 756.65625,
      "completions/mean_terminated_length": 748.0322265625,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 2.611111111111111,
      "grad_norm": 2.220694192968226,
      "kl": 0.22216796875,
      "learning_rate": 2.4475106635572696e-07,
      "loss": -0.1817,
      "num_tokens": 23741380.0,
      "reward": 0.01998922973871231,
      "reward_std": 0.0389016717672348,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001376922708004713,
      "rewards/logprob_reward/std": 0.003074995242059231,
      "step": 846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 743.71875,
      "completions/mean_terminated_length": 734.6773681640625,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 2.6141975308641974,
      "grad_norm": 2.4805680619076704,
      "kl": 0.2080078125,
      "learning_rate": 2.4425125220986503e-07,
      "loss": -0.4004,
      "num_tokens": 23771663.0,
      "reward": 0.03915968909859657,
      "reward_std": 0.05278148874640465,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0018440973944962025,
      "rewards/logprob_reward/std": 0.003013347741216421,
      "step": 847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 736.8125,
      "completions/mean_terminated_length": 695.7857666015625,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.617283950617284,
      "grad_norm": 2.5417168398216003,
      "kl": 0.20831298828125,
      "learning_rate": 2.437514610530246e-07,
      "loss": -0.474,
      "num_tokens": 23801965.0,
      "reward": 0.033397383987903595,
      "reward_std": 0.03566242754459381,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0023859834764152765,
      "rewards/logprob_reward/std": 0.0035746481735259295,
      "step": 848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 673.34375,
      "completions/mean_terminated_length": 662.0322265625,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 2.6203703703703702,
      "grad_norm": 2.3381554960645095,
      "kl": 0.223876953125,
      "learning_rate": 2.4325169488385137e-07,
      "loss": -0.2657,
      "num_tokens": 23829732.0,
      "reward": 0.032207190990448,
      "reward_std": 0.038205452263355255,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0010635434882715344,
      "rewards/logprob_reward/std": 0.001865375554189086,
      "step": 849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 691.75,
      "completions/mean_terminated_length": 681.0322265625,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 2.623456790123457,
      "grad_norm": 1.986900300570951,
      "kl": 0.249755859375,
      "learning_rate": 2.4275195570089083e-07,
      "loss": -0.1838,
      "num_tokens": 23858240.0,
      "reward": 0.05186006799340248,
      "reward_std": 0.046564631164073944,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0020667435601353645,
      "rewards/logprob_reward/std": 0.004226405173540115,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 814.0,
      "completions/mean_length": 594.40625,
      "completions/mean_terminated_length": 580.54833984375,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 2.626543209876543,
      "grad_norm": 3.1367713294332105,
      "kl": 0.2412109375,
      "learning_rate": 2.42252245502581e-07,
      "loss": -0.2543,
      "num_tokens": 23883045.0,
      "reward": 0.03827198967337608,
      "reward_std": 0.054491765797138214,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0008577638072893023,
      "rewards/logprob_reward/std": 0.001706449780613184,
      "step": 851
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 743.625,
      "completions/mean_terminated_length": 714.6206665039062,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 2.6296296296296298,
      "grad_norm": 1.8859280096829483,
      "kl": NaN,
      "learning_rate": 2.417525662872436e-07,
      "loss": -0.0967,
      "num_tokens": 23913497.0,
      "reward": 0.030198421329259872,
      "reward_std": 0.03990299627184868,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.002303801476955414,
      "rewards/logprob_reward/std": 0.0065719569101929665,
      "step": 852
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 749.875,
      "completions/mean_terminated_length": 721.5172119140625,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 2.632716049382716,
      "grad_norm": 1.8682637632133128,
      "kl": NaN,
      "learning_rate": 2.412529200530767e-07,
      "loss": -0.0972,
      "num_tokens": 23944309.0,
      "reward": 0.020021267235279083,
      "reward_std": 0.03287472575902939,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0014125206507742405,
      "rewards/logprob_reward/std": 0.0031592841260135174,
      "step": 853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 673.03125,
      "completions/mean_terminated_length": 622.8928833007812,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 2.6358024691358026,
      "grad_norm": 2.147540991945435,
      "kl": 0.2291259765625,
      "learning_rate": 2.407533087981463e-07,
      "loss": -0.1666,
      "num_tokens": 23971974.0,
      "reward": 0.04172077775001526,
      "reward_std": 0.047027163207530975,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.008161972276866436,
      "rewards/logprob_reward/std": 0.0175464004278183,
      "step": 854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 744.46875,
      "completions/mean_terminated_length": 735.4515991210938,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 2.638888888888889,
      "grad_norm": 1.9951436098050117,
      "kl": 0.2181396484375,
      "learning_rate": 2.4025373452037865e-07,
      "loss": -0.0855,
      "num_tokens": 24002289.0,
      "reward": 0.02945689484477043,
      "reward_std": 0.040991902351379395,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0014798822812736034,
      "rewards/logprob_reward/std": 0.002451105508953333,
      "step": 855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 772.28125,
      "completions/mean_terminated_length": 746.2413940429688,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 2.6419753086419755,
      "grad_norm": 2.1854130021683753,
      "kl": 0.199951171875,
      "learning_rate": 2.3975419921755215e-07,
      "loss": -0.1726,
      "num_tokens": 24033878.0,
      "reward": 0.038735829293727875,
      "reward_std": 0.03313373401761055,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0013731422368437052,
      "rewards/logprob_reward/std": 0.002503145020455122,
      "step": 856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 691.5625,
      "completions/mean_terminated_length": 669.4000244140625,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 2.6450617283950617,
      "grad_norm": 2.2495599348922664,
      "kl": 0.2222900390625,
      "learning_rate": 2.3925470488728935e-07,
      "loss": -0.1941,
      "num_tokens": 24062368.0,
      "reward": 0.036605220288038254,
      "reward_std": 0.04203525185585022,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0024780225940048695,
      "rewards/logprob_reward/std": 0.004517507739365101,
      "step": 857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 926.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 662.5,
      "completions/mean_terminated_length": 662.5,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.648148148148148,
      "grad_norm": 2.343706151366428,
      "kl": 0.2281494140625,
      "learning_rate": 2.3875525352704866e-07,
      "loss": -0.1816,
      "num_tokens": 24090336.0,
      "reward": 0.050402089953422546,
      "reward_std": 0.039860792458057404,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003918987698853016,
      "rewards/logprob_reward/std": 0.0071139344945549965,
      "step": 858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 803.0,
      "completions/mean_terminated_length": 780.137939453125,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 2.6512345679012346,
      "grad_norm": 2.090071290877971,
      "kl": 0.214111328125,
      "learning_rate": 2.38255847134117e-07,
      "loss": -0.2867,
      "num_tokens": 24123040.0,
      "reward": 0.039301685988903046,
      "reward_std": 0.04914182424545288,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0020018750801682472,
      "rewards/logprob_reward/std": 0.005134669132530689,
      "step": 859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 733.625,
      "completions/mean_terminated_length": 714.2667236328125,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.6543209876543212,
      "grad_norm": 2.1483396464727327,
      "kl": 0.2054443359375,
      "learning_rate": 2.3775648770560126e-07,
      "loss": -0.236,
      "num_tokens": 24153412.0,
      "reward": 0.023131113499403,
      "reward_std": 0.03914899379014969,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0013956804759800434,
      "rewards/logprob_reward/std": 0.004638405051082373,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 975.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 698.53125,
      "completions/mean_terminated_length": 698.53125,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 2.6574074074074074,
      "grad_norm": 2.0154457417961686,
      "kl": 0.23779296875,
      "learning_rate": 2.3725717723842066e-07,
      "loss": -0.2103,
      "num_tokens": 24182273.0,
      "reward": 0.03179170563817024,
      "reward_std": 0.03794276341795921,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.004074117634445429,
      "rewards/logprob_reward/std": 0.007332983892410994,
      "step": 861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 694.5625,
      "completions/mean_terminated_length": 660.4827270507812,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 2.6604938271604937,
      "grad_norm": 2.035982964371428,
      "kl": 0.2576904296875,
      "learning_rate": 2.3675791772929862e-07,
      "loss": -0.1806,
      "num_tokens": 24211475.0,
      "reward": 0.03286074101924896,
      "reward_std": 0.0356038361787796,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0017897089710459113,
      "rewards/logprob_reward/std": 0.0033139868173748255,
      "step": 862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 687.03125,
      "completions/mean_terminated_length": 664.5667114257812,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 2.6635802469135803,
      "grad_norm": 3.2998292410758596,
      "kl": 0.2371826171875,
      "learning_rate": 2.3625871117475466e-07,
      "loss": -0.4995,
      "num_tokens": 24239900.0,
      "reward": 0.04468598589301109,
      "reward_std": 0.052429646253585815,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0010399832390248775,
      "rewards/logprob_reward/std": 0.002016523154452443,
      "step": 863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 741.375,
      "completions/mean_terminated_length": 712.137939453125,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 2.6666666666666665,
      "grad_norm": 2.566695161347095,
      "kl": 0.25341796875,
      "learning_rate": 2.357595595710967e-07,
      "loss": -0.2856,
      "num_tokens": 24270364.0,
      "reward": 0.019286027178168297,
      "reward_std": 0.025570526719093323,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0005955855594947934,
      "rewards/logprob_reward/std": 0.0013773522805422544,
      "step": 864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 741.8125,
      "completions/mean_terminated_length": 723.0000610351562,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 2.669753086419753,
      "grad_norm": 2.1976330674905262,
      "kl": 0.2265625,
      "learning_rate": 2.3526046491441277e-07,
      "loss": -0.2576,
      "num_tokens": 24300518.0,
      "reward": 0.036166466772556305,
      "reward_std": 0.0490657202899456,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001990516670048237,
      "rewards/logprob_reward/std": 0.003708133241161704,
      "step": 865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 716.40625,
      "completions/mean_terminated_length": 659.4444580078125,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 2.6728395061728394,
      "grad_norm": 2.320208401868147,
      "kl": 0.2210693359375,
      "learning_rate": 2.3476142920056315e-07,
      "loss": -0.1642,
      "num_tokens": 24330003.0,
      "reward": 0.04185473173856735,
      "reward_std": 0.03403524309396744,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0013663668651133776,
      "rewards/logprob_reward/std": 0.0019420768367126584,
      "step": 866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 679.96875,
      "completions/mean_terminated_length": 668.8709716796875,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 2.675925925925926,
      "grad_norm": 2.3944614516156175,
      "kl": 0.2408447265625,
      "learning_rate": 2.3426245442517254e-07,
      "loss": -0.3096,
      "num_tokens": 24357998.0,
      "reward": 0.03848707303404808,
      "reward_std": 0.03939800336956978,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0010967478156089783,
      "rewards/logprob_reward/std": 0.002325907815247774,
      "step": 867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 739.6875,
      "completions/mean_terminated_length": 720.7333984375,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 2.6790123456790123,
      "grad_norm": 2.7347047246556753,
      "kl": 0.223388671875,
      "learning_rate": 2.3376354258362185e-07,
      "loss": -0.2714,
      "num_tokens": 24388320.0,
      "reward": 0.04037487506866455,
      "reward_std": 0.05308530479669571,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.003194308839738369,
      "rewards/logprob_reward/std": 0.006363058928400278,
      "step": 868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 745.21875,
      "completions/mean_terminated_length": 726.6333618164062,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 2.682098765432099,
      "grad_norm": 1.995427564199402,
      "kl": 0.229248046875,
      "learning_rate": 2.3326469567104044e-07,
      "loss": -0.2556,
      "num_tokens": 24418655.0,
      "reward": 0.0455198772251606,
      "reward_std": 0.046911582350730896,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001966530457139015,
      "rewards/logprob_reward/std": 0.004726557061076164,
      "step": 869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 731.84375,
      "completions/mean_terminated_length": 701.6206665039062,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 2.685185185185185,
      "grad_norm": 2.1619837369227914,
      "kl": 0.2369384765625,
      "learning_rate": 2.3276591568229787e-07,
      "loss": -0.1859,
      "num_tokens": 24448690.0,
      "reward": 0.031650789082050323,
      "reward_std": 0.03955671191215515,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.00044532225001603365,
      "rewards/logprob_reward/std": 0.001193919568322599,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 691.03125,
      "completions/mean_terminated_length": 668.8333740234375,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 2.6882716049382713,
      "grad_norm": 2.187276421691549,
      "kl": 0.258544921875,
      "learning_rate": 2.3226720461199626e-07,
      "loss": -0.2818,
      "num_tokens": 24477479.0,
      "reward": 0.04434952139854431,
      "reward_std": 0.0494009368121624,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0006661323131993413,
      "rewards/logprob_reward/std": 0.0018652883591130376,
      "step": 871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 735.0625,
      "completions/mean_terminated_length": 715.800048828125,
      "completions/min_length": 535.0,
      "completions/min_terminated_length": 535.0,
      "epoch": 2.691358024691358,
      "grad_norm": 3.1963819380802896,
      "kl": 0.20294189453125,
      "learning_rate": 2.3176856445446187e-07,
      "loss": -0.4824,
      "num_tokens": 24507285.0,
      "reward": 0.04160226508975029,
      "reward_std": 0.04009556770324707,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0010858499445021152,
      "rewards/logprob_reward/std": 0.0035910236183553934,
      "step": 872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 655.5625,
      "completions/mean_terminated_length": 643.6774291992188,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 2.6944444444444446,
      "grad_norm": 2.311400684915605,
      "kl": 0.2564697265625,
      "learning_rate": 2.3126999720373757e-07,
      "loss": -0.2976,
      "num_tokens": 24534211.0,
      "reward": 0.044940121471881866,
      "reward_std": 0.04768058657646179,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0013223541900515556,
      "rewards/logprob_reward/std": 0.0023013753816485405,
      "step": 873
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 727.3125,
      "completions/mean_terminated_length": 717.741943359375,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 2.697530864197531,
      "grad_norm": 2.054519713179906,
      "kl": NaN,
      "learning_rate": 2.3077150485357477e-07,
      "loss": -0.2575,
      "num_tokens": 24564097.0,
      "reward": 0.016626127064228058,
      "reward_std": 0.02705797366797924,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0011123641161248088,
      "rewards/logprob_reward/std": 0.0025833887048065662,
      "step": 874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 692.96875,
      "completions/mean_terminated_length": 670.9000244140625,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 2.700617283950617,
      "grad_norm": 2.910370041402984,
      "kl": 0.2257080078125,
      "learning_rate": 2.3027308939742502e-07,
      "loss": -0.4352,
      "num_tokens": 24592792.0,
      "reward": 0.04235769063234329,
      "reward_std": 0.04238816350698471,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0019252125639468431,
      "rewards/logprob_reward/std": 0.0034259967505931854,
      "step": 875
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 761.40625,
      "completions/mean_terminated_length": 687.8800048828125,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 2.7037037037037037,
      "grad_norm": 1.455819713215063,
      "kl": NaN,
      "learning_rate": 2.2977475282843266e-07,
      "loss": -0.2113,
      "num_tokens": 24623477.0,
      "reward": 0.019184719771146774,
      "reward_std": 0.02016684226691723,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0004830232937820256,
      "rewards/logprob_reward/std": 0.0013851220719516277,
      "step": 876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 681.5,
      "completions/mean_terminated_length": 658.6666870117188,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 2.7067901234567904,
      "grad_norm": 1.8719776724399728,
      "kl": 0.2705078125,
      "learning_rate": 2.292764971394265e-07,
      "loss": -0.2832,
      "num_tokens": 24652041.0,
      "reward": 0.031855180859565735,
      "reward_std": 0.03468414396047592,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0006724251434206963,
      "rewards/logprob_reward/std": 0.001641817856580019,
      "step": 877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1014.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 685.75,
      "completions/mean_terminated_length": 685.75,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 2.7098765432098766,
      "grad_norm": 2.0589673508480195,
      "kl": 0.2218017578125,
      "learning_rate": 2.2877832432291188e-07,
      "loss": -0.1418,
      "num_tokens": 24680525.0,
      "reward": 0.033176496624946594,
      "reward_std": 0.04296921193599701,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002140550408512354,
      "rewards/logprob_reward/std": 0.004232995677739382,
      "step": 878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 724.21875,
      "completions/mean_terminated_length": 668.7037353515625,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 2.712962962962963,
      "grad_norm": 2.119981505262375,
      "kl": 0.23681640625,
      "learning_rate": 2.2828023637106273e-07,
      "loss": -0.1436,
      "num_tokens": 24710444.0,
      "reward": 0.020132753998041153,
      "reward_std": 0.03874822333455086,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001536392024718225,
      "rewards/logprob_reward/std": 0.0035032329615205526,
      "step": 879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 752.5625,
      "completions/mean_terminated_length": 713.7857666015625,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 2.7160493827160495,
      "grad_norm": 3.2460424909186703,
      "kl": 0.2237548828125,
      "learning_rate": 2.2778223527571362e-07,
      "loss": -0.462,
      "num_tokens": 24741258.0,
      "reward": 0.03546038269996643,
      "reward_std": 0.03611791878938675,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0012059778673574328,
      "rewards/logprob_reward/std": 0.002610138850286603,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 711.5625,
      "completions/mean_terminated_length": 679.2413940429688,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 2.7191358024691357,
      "grad_norm": 1.6212524916976438,
      "kl": 0.27587890625,
      "learning_rate": 2.2728432302835183e-07,
      "loss": -0.0666,
      "num_tokens": 24770404.0,
      "reward": 0.016240714117884636,
      "reward_std": 0.02671341598033905,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0006841267459094524,
      "rewards/logprob_reward/std": 0.0019257022067904472,
      "step": 881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 743.875,
      "completions/mean_terminated_length": 692.0,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 2.7222222222222223,
      "grad_norm": 2.0606874675581333,
      "kl": 0.203125,
      "learning_rate": 2.2678650162010937e-07,
      "loss": -0.1637,
      "num_tokens": 24800956.0,
      "reward": 0.025701280683279037,
      "reward_std": 0.04016866162419319,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0007791995303705335,
      "rewards/logprob_reward/std": 0.0015462575247511268,
      "step": 882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 747.03125,
      "completions/mean_terminated_length": 728.5667114257812,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 2.7253086419753085,
      "grad_norm": 2.542278374391496,
      "kl": 0.2138671875,
      "learning_rate": 2.2628877304175472e-07,
      "loss": -0.1805,
      "num_tokens": 24831345.0,
      "reward": 0.039266541600227356,
      "reward_std": 0.03604422137141228,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0019628223963081837,
      "rewards/logprob_reward/std": 0.0035051226150244474,
      "step": 883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 720.8125,
      "completions/mean_terminated_length": 677.5,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 2.728395061728395,
      "grad_norm": 1.6664721271423677,
      "kl": 0.21124267578125,
      "learning_rate": 2.2579113928368548e-07,
      "loss": -0.0488,
      "num_tokens": 24860695.0,
      "reward": 0.037032563239336014,
      "reward_std": 0.03367120772600174,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0029528478626161814,
      "rewards/logprob_reward/std": 0.005151271354407072,
      "step": 884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 700.625,
      "completions/mean_terminated_length": 679.0667114257812,
      "completions/min_length": 380.0,
      "completions/min_terminated_length": 380.0,
      "epoch": 2.7314814814814814,
      "grad_norm": 2.40551840197583,
      "kl": 0.2275390625,
      "learning_rate": 2.2529360233591997e-07,
      "loss": -0.2004,
      "num_tokens": 24889427.0,
      "reward": 0.0368088036775589,
      "reward_std": 0.04108230024576187,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0027042264118790627,
      "rewards/logprob_reward/std": 0.004673474468290806,
      "step": 885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 835.0,
      "completions/mean_length": 687.625,
      "completions/mean_terminated_length": 625.3333129882812,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.734567901234568,
      "grad_norm": 1.9828080778962243,
      "kl": 0.2723388671875,
      "learning_rate": 2.2479616418808915e-07,
      "loss": -0.1018,
      "num_tokens": 24918295.0,
      "reward": 0.041648536920547485,
      "reward_std": 0.04729478806257248,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0046094865538179874,
      "rewards/logprob_reward/std": 0.01932183839380741,
      "step": 886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 729.0,
      "completions/mean_terminated_length": 709.3333740234375,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 2.7376543209876543,
      "grad_norm": 3.843452394623579,
      "kl": 0.23779296875,
      "learning_rate": 2.242988268294292e-07,
      "loss": -0.3611,
      "num_tokens": 24948303.0,
      "reward": 0.038460731506347656,
      "reward_std": 0.04706891253590584,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0010674791410565376,
      "rewards/logprob_reward/std": 0.0017414541216567159,
      "step": 887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 713.0,
      "completions/mean_terminated_length": 668.5714721679688,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 2.7407407407407405,
      "grad_norm": 1.9014996345376378,
      "kl": 0.23095703125,
      "learning_rate": 2.23801592248773e-07,
      "loss": -0.0997,
      "num_tokens": 24977355.0,
      "reward": 0.04267347976565361,
      "reward_std": 0.04828355461359024,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0022760871797800064,
      "rewards/logprob_reward/std": 0.00266113318502903,
      "step": 888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 734.125,
      "completions/mean_terminated_length": 714.800048828125,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 2.743827160493827,
      "grad_norm": 2.6287424716050847,
      "kl": 0.1951904296875,
      "learning_rate": 2.2330446243454265e-07,
      "loss": -0.4275,
      "num_tokens": 25007187.0,
      "reward": 0.04790825769305229,
      "reward_std": 0.05132868513464928,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001148065086454153,
      "rewards/logprob_reward/std": 0.002382042817771435,
      "step": 889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 720.65625,
      "completions/mean_terminated_length": 700.433349609375,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 2.746913580246914,
      "grad_norm": 2.1345033147595203,
      "kl": 0.2271728515625,
      "learning_rate": 2.228074393747412e-07,
      "loss": -0.2181,
      "num_tokens": 25036912.0,
      "reward": 0.03855757787823677,
      "reward_std": 0.04170295223593712,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.00117508287075907,
      "rewards/logprob_reward/std": 0.002204425632953644,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 752.15625,
      "completions/mean_terminated_length": 724.0344848632812,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 2.75,
      "grad_norm": 2.1455061021486355,
      "kl": 0.222900390625,
      "learning_rate": 2.2231052505694458e-07,
      "loss": -0.0601,
      "num_tokens": 25067697.0,
      "reward": 0.02888527512550354,
      "reward_std": 0.03874707967042923,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0008447513682767749,
      "rewards/logprob_reward/std": 0.0017420414369553328,
      "step": 891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 756.03125,
      "completions/mean_terminated_length": 706.4074096679688,
      "completions/min_length": 524.0,
      "completions/min_terminated_length": 524.0,
      "epoch": 2.753086419753086,
      "grad_norm": 1.9247640801259118,
      "kl": 0.2283935546875,
      "learning_rate": 2.2181372146829418e-07,
      "loss": -0.1753,
      "num_tokens": 25098878.0,
      "reward": 0.0382111594080925,
      "reward_std": 0.040418513119220734,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.000790177786257118,
      "rewards/logprob_reward/std": 0.002555250423029065,
      "step": 892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 743.40625,
      "completions/mean_terminated_length": 724.7000122070312,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 2.756172839506173,
      "grad_norm": 3.2209369553822227,
      "kl": 0.2071533203125,
      "learning_rate": 2.213170305954884e-07,
      "loss": -0.5666,
      "num_tokens": 25129303.0,
      "reward": 0.045907195657491684,
      "reward_std": 0.049174029380083084,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0023968853056430817,
      "rewards/logprob_reward/std": 0.005156919360160828,
      "step": 893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 685.5,
      "completions/mean_terminated_length": 674.5806274414062,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 2.7592592592592595,
      "grad_norm": 2.136325788844036,
      "kl": 0.22412109375,
      "learning_rate": 2.2082045442477497e-07,
      "loss": -0.0527,
      "num_tokens": 25157715.0,
      "reward": 0.06102164462208748,
      "reward_std": 0.046311501413583755,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0018296046182513237,
      "rewards/logprob_reward/std": 0.003303486853837967,
      "step": 894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 991.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 695.0625,
      "completions/mean_terminated_length": 695.0625,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 2.7623456790123457,
      "grad_norm": 2.0556344749647875,
      "kl": 0.2237548828125,
      "learning_rate": 2.2032399494194292e-07,
      "loss": -0.0831,
      "num_tokens": 25186289.0,
      "reward": 0.041386574506759644,
      "reward_std": 0.04076617956161499,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0008461924735456705,
      "rewards/logprob_reward/std": 0.0014869216829538345,
      "step": 895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 734.375,
      "completions/mean_terminated_length": 704.413818359375,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 2.765432098765432,
      "grad_norm": 2.7409042385817526,
      "kl": 0.2484130859375,
      "learning_rate": 2.1982765413231466e-07,
      "loss": -0.3198,
      "num_tokens": 25216233.0,
      "reward": 0.036085888743400574,
      "reward_std": 0.04144894704222679,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0019009875832125545,
      "rewards/logprob_reward/std": 0.0037570588756352663,
      "step": 896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 746.65625,
      "completions/mean_terminated_length": 695.2963256835938,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 2.7685185185185186,
      "grad_norm": 1.8348602353578563,
      "kl": 0.2242431640625,
      "learning_rate": 2.1933143398073805e-07,
      "loss": -0.0908,
      "num_tokens": 25246846.0,
      "reward": 0.02923869527876377,
      "reward_std": 0.04143529012799263,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.001237439108081162,
      "rewards/logprob_reward/std": 0.002240176545456052,
      "step": 897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1023.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 716.78125,
      "completions/mean_terminated_length": 716.78125,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 2.771604938271605,
      "grad_norm": 1.7030290514235649,
      "kl": 0.2056884765625,
      "learning_rate": 2.1883533647157828e-07,
      "loss": -0.0736,
      "num_tokens": 25276579.0,
      "reward": 0.05580006539821625,
      "reward_std": 0.047913894057273865,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0029722945764660835,
      "rewards/logprob_reward/std": 0.004684407729655504,
      "step": 898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 748.9375,
      "completions/mean_terminated_length": 709.6428833007812,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 2.7746913580246915,
      "grad_norm": 1.980861417160589,
      "kl": 0.213134765625,
      "learning_rate": 2.1833936358871045e-07,
      "loss": -0.238,
      "num_tokens": 25306981.0,
      "reward": 0.04480702057480812,
      "reward_std": 0.045569345355033875,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0011744651710614562,
      "rewards/logprob_reward/std": 0.002723806072026491,
      "step": 899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 705.53125,
      "completions/mean_terminated_length": 672.586181640625,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.7777777777777777,
      "grad_norm": 2.1573868648030263,
      "kl": 0.236328125,
      "learning_rate": 2.1784351731551077e-07,
      "loss": -0.0647,
      "num_tokens": 25335726.0,
      "reward": 0.03806355968117714,
      "reward_std": 0.0521421954035759,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0006261759554035962,
      "rewards/logprob_reward/std": 0.0018277488416060805,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 727.90625,
      "completions/mean_terminated_length": 697.27587890625,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.7808641975308643,
      "grad_norm": 3.639874262202838,
      "kl": 0.2315673828125,
      "learning_rate": 2.1734779963484959e-07,
      "loss": -0.3523,
      "num_tokens": 25366059.0,
      "reward": 0.03266870230436325,
      "reward_std": 0.05293484777212143,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0015763344708830118,
      "rewards/logprob_reward/std": 0.0033819256350398064,
      "step": 901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 744.6875,
      "completions/mean_terminated_length": 704.7857666015625,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 2.7839506172839505,
      "grad_norm": 2.2011254841985393,
      "kl": 0.223388671875,
      "learning_rate": 2.1685221252908282e-07,
      "loss": -0.3663,
      "num_tokens": 25396109.0,
      "reward": 0.03815517947077751,
      "reward_std": 0.04741198942065239,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0007279774872586131,
      "rewards/logprob_reward/std": 0.0016141906380653381,
      "step": 902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 789.53125,
      "completions/mean_terminated_length": 746.1111450195312,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 2.787037037037037,
      "grad_norm": 1.5647770721778727,
      "kl": 0.2010498046875,
      "learning_rate": 2.163567579800443e-07,
      "loss": 0.0294,
      "num_tokens": 25428246.0,
      "reward": 0.02891172468662262,
      "reward_std": 0.01902131177484989,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0008741386118344963,
      "rewards/logprob_reward/std": 0.002331784460693598,
      "step": 903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 653.4375,
      "completions/mean_terminated_length": 641.4838256835938,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 2.7901234567901234,
      "grad_norm": 1.7748957652190511,
      "kl": 0.2374267578125,
      "learning_rate": 2.1586143796903775e-07,
      "loss": -0.1707,
      "num_tokens": 25455468.0,
      "reward": 0.04110602289438248,
      "reward_std": 0.04065663367509842,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0005344680976122618,
      "rewards/logprob_reward/std": 0.0008938212413340807,
      "step": 904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 684.09375,
      "completions/mean_terminated_length": 673.1290283203125,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 2.7932098765432096,
      "grad_norm": 2.0471774127617355,
      "kl": 0.234375,
      "learning_rate": 2.1536625447682877e-07,
      "loss": -0.1063,
      "num_tokens": 25483795.0,
      "reward": 0.02925974503159523,
      "reward_std": 0.034787438809871674,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0012608272954821587,
      "rewards/logprob_reward/std": 0.002463593613356352,
      "step": 905
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 690.53125,
      "completions/mean_terminated_length": 656.0344848632812,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 2.7962962962962963,
      "grad_norm": 2.0512353520988977,
      "kl": NaN,
      "learning_rate": 2.1487120948363713e-07,
      "loss": -0.06,
      "num_tokens": 25512088.0,
      "reward": 0.023405537009239197,
      "reward_std": 0.0352381132543087,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0017005959525704384,
      "rewards/logprob_reward/std": 0.006216020323336124,
      "step": 906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 708.15625,
      "completions/mean_terminated_length": 687.1000366210938,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.799382716049383,
      "grad_norm": 1.8893150159550898,
      "kl": 0.24169921875,
      "learning_rate": 2.1437630496912889e-07,
      "loss": -0.1146,
      "num_tokens": 25540801.0,
      "reward": 0.04442692548036575,
      "reward_std": 0.034866563975811005,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0007521397201344371,
      "rewards/logprob_reward/std": 0.001434684731066227,
      "step": 907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 721.09375,
      "completions/mean_terminated_length": 700.9000244140625,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 2.802469135802469,
      "grad_norm": 2.020542494468501,
      "kl": 0.2286376953125,
      "learning_rate": 2.1388154291240794e-07,
      "loss": -0.2745,
      "num_tokens": 25569920.0,
      "reward": 0.05134275183081627,
      "reward_std": 0.05257630720734596,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0014919439563527703,
      "rewards/logprob_reward/std": 0.0018537379801273346,
      "step": 908
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 707.40625,
      "completions/mean_terminated_length": 674.6551513671875,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 2.8055555555555554,
      "grad_norm": 1.8797137190535893,
      "kl": NaN,
      "learning_rate": 2.133869252920089e-07,
      "loss": -0.1527,
      "num_tokens": 25599417.0,
      "reward": 0.03823522478342056,
      "reward_std": 0.04038812220096588,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0008169173379428685,
      "rewards/logprob_reward/std": 0.0017048893496394157,
      "step": 909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 727.09375,
      "completions/mean_terminated_length": 696.3793334960938,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 2.808641975308642,
      "grad_norm": 1.8310669285933219,
      "kl": 0.2655029296875,
      "learning_rate": 2.128924540858885e-07,
      "loss": -0.0971,
      "num_tokens": 25629592.0,
      "reward": 0.019925329834222794,
      "reward_std": 0.033746667206287384,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001305922050960362,
      "rewards/logprob_reward/std": 0.002956134732812643,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 701.96875,
      "completions/mean_terminated_length": 691.5806274414062,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 2.8117283950617287,
      "grad_norm": 2.176586044774021,
      "kl": 0.244873046875,
      "learning_rate": 2.1239813127141828e-07,
      "loss": -0.1411,
      "num_tokens": 25658383.0,
      "reward": 0.022919094190001488,
      "reward_std": 0.039000414311885834,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.001160103245638311,
      "rewards/logprob_reward/std": 0.0021208145190030336,
      "step": 911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 740.8125,
      "completions/mean_terminated_length": 700.357177734375,
      "completions/min_length": 535.0,
      "completions/min_terminated_length": 535.0,
      "epoch": 2.814814814814815,
      "grad_norm": 2.4825655831882703,
      "kl": 0.2076416015625,
      "learning_rate": 2.1190395882537598e-07,
      "loss": -0.1728,
      "num_tokens": 25688761.0,
      "reward": 0.03592553734779358,
      "reward_std": 0.043933987617492676,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0017228213837370276,
      "rewards/logprob_reward/std": 0.0035046476405113935,
      "step": 912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 778.71875,
      "completions/mean_terminated_length": 753.3448486328125,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 2.817901234567901,
      "grad_norm": 1.8471597814389293,
      "kl": 0.1898193359375,
      "learning_rate": 2.1140993872393833e-07,
      "loss": -0.0453,
      "num_tokens": 25720608.0,
      "reward": 0.03514767065644264,
      "reward_std": 0.0339820571243763,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0008585240575484931,
      "rewards/logprob_reward/std": 0.002222515409812331,
      "step": 913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 770.125,
      "completions/mean_terminated_length": 723.1111450195312,
      "completions/min_length": 618.0,
      "completions/min_terminated_length": 618.0,
      "epoch": 2.8209876543209877,
      "grad_norm": 1.4505306512388436,
      "kl": 0.2109375,
      "learning_rate": 2.1091607294267269e-07,
      "loss": -0.0187,
      "num_tokens": 25751912.0,
      "reward": 0.020330917090177536,
      "reward_std": 0.02412344701588154,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0017565754242241383,
      "rewards/logprob_reward/std": 0.004126360174268484,
      "step": 914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 823.0,
      "completions/mean_length": 644.8125,
      "completions/mean_terminated_length": 619.5333862304688,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.824074074074074,
      "grad_norm": 2.6444669946440027,
      "kl": 0.23681640625,
      "learning_rate": 2.1042236345652947e-07,
      "loss": -0.2378,
      "num_tokens": 25778326.0,
      "reward": 0.032222680747509,
      "reward_std": 0.026036633178591728,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0010807574726641178,
      "rewards/logprob_reward/std": 0.0017917260993272066,
      "step": 915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 720.8125,
      "completions/mean_terminated_length": 700.6000366210938,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.8271604938271606,
      "grad_norm": 2.3359892011606327,
      "kl": 0.2286376953125,
      "learning_rate": 2.0992881223983368e-07,
      "loss": -0.1425,
      "num_tokens": 25808068.0,
      "reward": 0.03367864713072777,
      "reward_std": 0.039221081882715225,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0026984962169080973,
      "rewards/logprob_reward/std": 0.0053538489155471325,
      "step": 916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 743.4375,
      "completions/mean_terminated_length": 703.357177734375,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 2.830246913580247,
      "grad_norm": 1.8391771393551033,
      "kl": 0.247802734375,
      "learning_rate": 2.0943542126627784e-07,
      "loss": -0.1331,
      "num_tokens": 25838754.0,
      "reward": 0.03146309033036232,
      "reward_std": 0.045010365545749664,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.00023676696582697332,
      "rewards/logprob_reward/std": 0.0008217405993491411,
      "step": 917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 783.78125,
      "completions/mean_terminated_length": 749.4642944335938,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.8333333333333335,
      "grad_norm": 1.8408733626005627,
      "kl": 0.1705322265625,
      "learning_rate": 2.0894219250891352e-07,
      "loss": -0.1964,
      "num_tokens": 25870683.0,
      "reward": 0.04172273352742195,
      "reward_std": 0.04826078563928604,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0012197005562484264,
      "rewards/logprob_reward/std": 0.0017729178071022034,
      "step": 918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 694.75,
      "completions/mean_terminated_length": 672.800048828125,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 2.8364197530864197,
      "grad_norm": 2.1710575076842136,
      "kl": 0.2451171875,
      "learning_rate": 2.0844912794014341e-07,
      "loss": -0.2082,
      "num_tokens": 25899011.0,
      "reward": 0.019306883215904236,
      "reward_std": 0.02787940949201584,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0006187591934576631,
      "rewards/logprob_reward/std": 0.0014984877780079842,
      "step": 919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 746.25,
      "completions/mean_terminated_length": 727.7333984375,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 2.8395061728395063,
      "grad_norm": 2.0881691914331175,
      "kl": 0.2122802734375,
      "learning_rate": 2.079562295317139e-07,
      "loss": -0.137,
      "num_tokens": 25929343.0,
      "reward": 0.02632749453186989,
      "reward_std": 0.03549928218126297,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0014749924885109067,
      "rewards/logprob_reward/std": 0.0022526613902300596,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 685.4375,
      "completions/mean_terminated_length": 674.51611328125,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.8425925925925926,
      "grad_norm": 2.000703787788389,
      "kl": 0.2445068359375,
      "learning_rate": 2.0746349925470672e-07,
      "loss": -0.072,
      "num_tokens": 25957837.0,
      "reward": 0.0353454053401947,
      "reward_std": 0.040932249277830124,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001078227418474853,
      "rewards/logprob_reward/std": 0.0023284656926989555,
      "step": 921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 678.03125,
      "completions/mean_terminated_length": 654.9666748046875,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 2.8456790123456788,
      "grad_norm": 2.779900339886693,
      "kl": 0.2435302734375,
      "learning_rate": 2.0697093907953134e-07,
      "loss": -0.2733,
      "num_tokens": 25986354.0,
      "reward": 0.02944917231798172,
      "reward_std": 0.04230036959052086,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0014713001437485218,
      "rewards/logprob_reward/std": 0.0037846944760531187,
      "step": 922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 784.0,
      "completions/mean_length": 648.78125,
      "completions/mean_terminated_length": 623.7667236328125,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 2.8487654320987654,
      "grad_norm": 2.106664401008419,
      "kl": 0.24365234375,
      "learning_rate": 2.0647855097591704e-07,
      "loss": -0.2234,
      "num_tokens": 26013543.0,
      "reward": 0.03965634107589722,
      "reward_std": 0.05002221092581749,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002395935822278261,
      "rewards/logprob_reward/std": 0.0039562019519507885,
      "step": 923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 680.96875,
      "completions/mean_terminated_length": 658.1000366210938,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 2.851851851851852,
      "grad_norm": 2.096405505444134,
      "kl": 0.2164306640625,
      "learning_rate": 2.0598633691290485e-07,
      "loss": -0.1971,
      "num_tokens": 26042054.0,
      "reward": 0.04500027745962143,
      "reward_std": 0.056555259972810745,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0013891983544453979,
      "rewards/logprob_reward/std": 0.0026214104145765305,
      "step": 924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 671.59375,
      "completions/mean_terminated_length": 635.137939453125,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 2.8549382716049383,
      "grad_norm": 1.8687028382292379,
      "kl": 0.2330322265625,
      "learning_rate": 2.054942988588399e-07,
      "loss": -0.059,
      "num_tokens": 26070213.0,
      "reward": 0.03630334883928299,
      "reward_std": 0.027717553079128265,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0021426090970635414,
      "rewards/logprob_reward/std": 0.003982920199632645,
      "step": 925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 740.9375,
      "completions/mean_terminated_length": 731.8064575195312,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.8580246913580245,
      "grad_norm": 1.6972618165454312,
      "kl": 0.237548828125,
      "learning_rate": 2.050024387813634e-07,
      "loss": -0.2142,
      "num_tokens": 26100399.0,
      "reward": 0.03215242922306061,
      "reward_std": 0.03464379906654358,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0010027001844719052,
      "rewards/logprob_reward/std": 0.002128482097759843,
      "step": 926
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 753.71875,
      "completions/mean_terminated_length": 735.7000122070312,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 2.861111111111111,
      "grad_norm": 1.7761275671751802,
      "kl": NaN,
      "learning_rate": 2.0451075864740496e-07,
      "loss": -0.1009,
      "num_tokens": 26130846.0,
      "reward": 0.029843822121620178,
      "reward_std": 0.035880472511053085,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0019098015036433935,
      "rewards/logprob_reward/std": 0.0037320053670555353,
      "step": 927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 747.5625,
      "completions/mean_terminated_length": 718.9655151367188,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 2.8641975308641974,
      "grad_norm": 1.7724801359206694,
      "kl": 0.217529296875,
      "learning_rate": 2.0401926042317455e-07,
      "loss": -0.1598,
      "num_tokens": 26161596.0,
      "reward": 0.032547809183597565,
      "reward_std": 0.027523929253220558,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0014420116785913706,
      "rewards/logprob_reward/std": 0.002904426772147417,
      "step": 928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 734.15625,
      "completions/mean_terminated_length": 692.7500610351562,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 2.867283950617284,
      "grad_norm": 1.7664998792321178,
      "kl": 0.225830078125,
      "learning_rate": 2.0352794607415465e-07,
      "loss": -0.0817,
      "num_tokens": 26191673.0,
      "reward": 0.04807525873184204,
      "reward_std": 0.04088424891233444,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0013336197007447481,
      "rewards/logprob_reward/std": 0.0023917900398373604,
      "step": 929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 659.1875,
      "completions/mean_terminated_length": 647.4193115234375,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 2.8703703703703702,
      "grad_norm": 2.7436686524222655,
      "kl": 0.2562255859375,
      "learning_rate": 2.0303681756509254e-07,
      "loss": -0.2881,
      "num_tokens": 26219299.0,
      "reward": 0.0265082735568285,
      "reward_std": 0.036000318825244904,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0016758590936660767,
      "rewards/logprob_reward/std": 0.004514663480222225,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 692.4375,
      "completions/mean_terminated_length": 670.3333740234375,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 2.873456790123457,
      "grad_norm": 2.264409669674971,
      "kl": 0.2442626953125,
      "learning_rate": 2.0254587685999215e-07,
      "loss": -0.1219,
      "num_tokens": 26248205.0,
      "reward": 0.04572978615760803,
      "reward_std": 0.03910953179001808,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0021997597068548203,
      "rewards/logprob_reward/std": 0.003756813472136855,
      "step": 931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 736.875,
      "completions/mean_terminated_length": 727.6128540039062,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.876543209876543,
      "grad_norm": 1.8927253489778453,
      "kl": 0.231689453125,
      "learning_rate": 2.020551259221066e-07,
      "loss": -0.0791,
      "num_tokens": 26277901.0,
      "reward": 0.050919897854328156,
      "reward_std": 0.04152818024158478,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0010221096454188228,
      "rewards/logprob_reward/std": 0.00207647611387074,
      "step": 932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 700.75,
      "completions/mean_terminated_length": 654.5714721679688,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 2.8796296296296298,
      "grad_norm": 2.6095332054392095,
      "kl": 0.22802734375,
      "learning_rate": 2.0156456671392988e-07,
      "loss": -0.3138,
      "num_tokens": 26306925.0,
      "reward": 0.026051906868815422,
      "reward_std": 0.03594221919775009,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0011687844526022673,
      "rewards/logprob_reward/std": 0.0023048915900290012,
      "step": 933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 738.71875,
      "completions/mean_terminated_length": 709.2069091796875,
      "completions/min_length": 498.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 2.882716049382716,
      "grad_norm": 2.3273980833688,
      "kl": 0.2119140625,
      "learning_rate": 2.010742011971895e-07,
      "loss": -0.1946,
      "num_tokens": 26337436.0,
      "reward": 0.01686321385204792,
      "reward_std": 0.032536305487155914,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0013757927808910608,
      "rewards/logprob_reward/std": 0.002666281070560217,
      "step": 934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 910.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 683.15625,
      "completions/mean_terminated_length": 683.15625,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.8858024691358026,
      "grad_norm": 4.617620683870493,
      "kl": 0.2593994140625,
      "learning_rate": 2.005840313328383e-07,
      "loss": -0.2423,
      "num_tokens": 26365493.0,
      "reward": 0.02862159162759781,
      "reward_std": 0.032037027180194855,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0005517672398127615,
      "rewards/logprob_reward/std": 0.001306802616454661,
      "step": 935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 994.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 632.53125,
      "completions/mean_terminated_length": 632.53125,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 2.888888888888889,
      "grad_norm": 3.2772853980728667,
      "kl": 0.247314453125,
      "learning_rate": 2.0009405908104673e-07,
      "loss": -0.3443,
      "num_tokens": 26391638.0,
      "reward": 0.047143395990133286,
      "reward_std": 0.045782119035720825,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0002982149017043412,
      "rewards/logprob_reward/std": 0.0007833261624909937,
      "step": 936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 763.75,
      "completions/mean_terminated_length": 746.4000244140625,
      "completions/min_length": 503.0,
      "completions/min_terminated_length": 503.0,
      "epoch": 2.8919753086419755,
      "grad_norm": 2.43808565296315,
      "kl": 0.20361328125,
      "learning_rate": 1.996042864011951e-07,
      "loss": -0.2154,
      "num_tokens": 26422678.0,
      "reward": 0.025890544056892395,
      "reward_std": 0.0403691790997982,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0009894927497953176,
      "rewards/logprob_reward/std": 0.0026666016783565283,
      "step": 937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 673.53125,
      "completions/mean_terminated_length": 623.4642944335938,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 2.8950617283950617,
      "grad_norm": 2.351329091261145,
      "kl": 0.24432373046875,
      "learning_rate": 1.9911471525186534e-07,
      "loss": -0.1476,
      "num_tokens": 26450139.0,
      "reward": 0.03514251857995987,
      "reward_std": 0.03566879779100418,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0008527971804141998,
      "rewards/logprob_reward/std": 0.0015333736082538962,
      "step": 938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1018.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 718.5625,
      "completions/mean_terminated_length": 718.5625,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.898148148148148,
      "grad_norm": 2.3260060500535107,
      "kl": 0.2093505859375,
      "learning_rate": 1.9862534759083379e-07,
      "loss": -0.2907,
      "num_tokens": 26479697.0,
      "reward": 0.03334007412195206,
      "reward_std": 0.04166783392429352,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0023223059251904488,
      "rewards/logprob_reward/std": 0.004669366870075464,
      "step": 939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 687.46875,
      "completions/mean_terminated_length": 687.46875,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.9012345679012346,
      "grad_norm": 2.299405399407316,
      "kl": 0.2261962890625,
      "learning_rate": 1.9813618537506302e-07,
      "loss": -0.1793,
      "num_tokens": 26508116.0,
      "reward": 0.038836508989334106,
      "reward_std": 0.04531483352184296,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001485014334321022,
      "rewards/logprob_reward/std": 0.0030897411052137613,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 695.40625,
      "completions/mean_terminated_length": 673.5000610351562,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 2.9043209876543212,
      "grad_norm": 1.9386965966121879,
      "kl": 0.218017578125,
      "learning_rate": 1.9764723056069365e-07,
      "loss": -0.1481,
      "num_tokens": 26536901.0,
      "reward": 0.026116058230400085,
      "reward_std": 0.02804435044527054,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0012400643900036812,
      "rewards/logprob_reward/std": 0.0018863547593355179,
      "step": 941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 694.75,
      "completions/mean_terminated_length": 672.800048828125,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 2.9074074074074074,
      "grad_norm": 2.2908048302294777,
      "kl": 0.231201171875,
      "learning_rate": 1.9715848510303739e-07,
      "loss": -0.2014,
      "num_tokens": 26565441.0,
      "reward": 0.04806693643331528,
      "reward_std": 0.04172935336828232,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001324373995885253,
      "rewards/logprob_reward/std": 0.002286948962137103,
      "step": 942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 668.8125,
      "completions/mean_terminated_length": 657.3547973632812,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 2.9104938271604937,
      "grad_norm": 2.797264602347024,
      "kl": 0.22705078125,
      "learning_rate": 1.966699509565685e-07,
      "loss": -0.2226,
      "num_tokens": 26592935.0,
      "reward": 0.03517685830593109,
      "reward_std": 0.04654119908809662,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0008909569587558508,
      "rewards/logprob_reward/std": 0.0019737931434065104,
      "step": 943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 724.46875,
      "completions/mean_terminated_length": 655.34619140625,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 2.9135802469135803,
      "grad_norm": 2.2749502960424355,
      "kl": 0.25048828125,
      "learning_rate": 1.961816300749163e-07,
      "loss": -0.1714,
      "num_tokens": 26622870.0,
      "reward": 0.0398334376513958,
      "reward_std": 0.045474015176296234,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002592710079625249,
      "rewards/logprob_reward/std": 0.005250102840363979,
      "step": 944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 650.375,
      "completions/mean_terminated_length": 638.3225708007812,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 2.9166666666666665,
      "grad_norm": 2.321613512476675,
      "kl": 0.2291259765625,
      "learning_rate": 1.9569352441085712e-07,
      "loss": -0.1206,
      "num_tokens": 26650066.0,
      "reward": 0.054110296070575714,
      "reward_std": 0.027386318892240524,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0010947741102427244,
      "rewards/logprob_reward/std": 0.002825903706252575,
      "step": 945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 733.0,
      "completions/mean_terminated_length": 713.6000366210938,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 2.919753086419753,
      "grad_norm": 2.2568177731349293,
      "kl": 0.21337890625,
      "learning_rate": 1.9520563591630686e-07,
      "loss": -0.179,
      "num_tokens": 26679926.0,
      "reward": 0.04479778930544853,
      "reward_std": 0.04891182482242584,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0011642095632851124,
      "rewards/logprob_reward/std": 0.002152997301891446,
      "step": 946
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 747.5,
      "completions/mean_terminated_length": 696.2963256835938,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 2.9228395061728394,
      "grad_norm": 2.7880005014153393,
      "kl": NaN,
      "learning_rate": 1.9471796654231278e-07,
      "loss": -0.2831,
      "num_tokens": 26710714.0,
      "reward": 0.022786159068346024,
      "reward_std": 0.02745286375284195,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0010123990941792727,
      "rewards/logprob_reward/std": 0.003126228228211403,
      "step": 947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 647.21875,
      "completions/mean_terminated_length": 622.1000366210938,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 2.925925925925926,
      "grad_norm": 2.3418864750573807,
      "kl": 0.2344970703125,
      "learning_rate": 1.9423051823904602e-07,
      "loss": -0.1891,
      "num_tokens": 26738029.0,
      "reward": 0.03349651023745537,
      "reward_std": 0.045034341514110565,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002496121684089303,
      "rewards/logprob_reward/std": 0.005302882753312588,
      "step": 948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 736.125,
      "completions/mean_terminated_length": 716.933349609375,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 2.9290123456790123,
      "grad_norm": 2.5584406905157295,
      "kl": 0.2247314453125,
      "learning_rate": 1.9374329295579372e-07,
      "loss": -0.0229,
      "num_tokens": 26768945.0,
      "reward": 0.03358815610408783,
      "reward_std": 0.033007651567459106,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0025979499332606792,
      "rewards/logprob_reward/std": 0.00609669741243124,
      "step": 949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 694.34375,
      "completions/mean_terminated_length": 672.36669921875,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 2.932098765432099,
      "grad_norm": 2.0999638328254715,
      "kl": 0.22021484375,
      "learning_rate": 1.9325629264095083e-07,
      "loss": -0.0932,
      "num_tokens": 26797632.0,
      "reward": 0.0319441556930542,
      "reward_std": 0.039987243711948395,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0007712829392403364,
      "rewards/logprob_reward/std": 0.001864199643023312,
      "step": 950
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 741.0,
      "completions/mean_terminated_length": 700.5714721679688,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 2.935185185185185,
      "grad_norm": 2.85673139460347,
      "kl": NaN,
      "learning_rate": 1.9276951924201304e-07,
      "loss": -0.3032,
      "num_tokens": 26828492.0,
      "reward": 0.03654161095619202,
      "reward_std": 0.022315043956041336,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.002407343592494726,
      "rewards/logprob_reward/std": 0.003987497184425592,
      "step": 951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 929.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 692.25,
      "completions/mean_terminated_length": 692.25,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 2.9382716049382713,
      "grad_norm": 2.587864809945036,
      "kl": 0.2215576171875,
      "learning_rate": 1.922829747055684e-07,
      "loss": -0.3074,
      "num_tokens": 26857088.0,
      "reward": 0.04753436893224716,
      "reward_std": 0.04605057090520859,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.004204855766147375,
      "rewards/logprob_reward/std": 0.008981630206108093,
      "step": 952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 872.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 655.125,
      "completions/mean_terminated_length": 655.125,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 2.941358024691358,
      "grad_norm": 2.2185380658297977,
      "kl": 0.2635498046875,
      "learning_rate": 1.9179666097728982e-07,
      "loss": -0.1516,
      "num_tokens": 26884268.0,
      "reward": 0.02021958865225315,
      "reward_std": 0.03220298886299133,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.001632875413633883,
      "rewards/logprob_reward/std": 0.0028763532172888517,
      "step": 953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 663.875,
      "completions/mean_terminated_length": 652.258056640625,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 2.9444444444444446,
      "grad_norm": 2.514627285805861,
      "kl": 0.2249755859375,
      "learning_rate": 1.9131058000192726e-07,
      "loss": -0.1363,
      "num_tokens": 26911888.0,
      "reward": 0.044653862714767456,
      "reward_std": 0.0406486913561821,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0010042909998446703,
      "rewards/logprob_reward/std": 0.00209109578281641,
      "step": 954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 658.3125,
      "completions/mean_terminated_length": 646.51611328125,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 2.947530864197531,
      "grad_norm": 2.402558732506387,
      "kl": 0.22216796875,
      "learning_rate": 1.9082473372329983e-07,
      "loss": -0.1917,
      "num_tokens": 26939318.0,
      "reward": 0.035541120916604996,
      "reward_std": 0.02778727188706398,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0012956904247403145,
      "rewards/logprob_reward/std": 0.0030140208546072245,
      "step": 955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 906.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 624.5,
      "completions/mean_terminated_length": 624.5,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 2.950617283950617,
      "grad_norm": 2.3041716856268715,
      "kl": 0.2430419921875,
      "learning_rate": 1.903391240842882e-07,
      "loss": -0.352,
      "num_tokens": 26965298.0,
      "reward": 0.04536670073866844,
      "reward_std": 0.04791241139173508,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017963348655030131,
      "rewards/logprob_reward/std": 0.0029619657434523106,
      "step": 956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 740.0,
      "completions/mean_terminated_length": 710.6206665039062,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 2.9537037037037037,
      "grad_norm": 2.511745953771838,
      "kl": 0.20751953125,
      "learning_rate": 1.8985375302682654e-07,
      "loss": -0.2179,
      "num_tokens": 26995682.0,
      "reward": 0.034126099199056625,
      "reward_std": 0.04123295471072197,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.003195664379745722,
      "rewards/logprob_reward/std": 0.008202649652957916,
      "step": 957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1013.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 638.4375,
      "completions/mean_terminated_length": 638.4375,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 2.9567901234567904,
      "grad_norm": 2.0066421954219074,
      "kl": 0.2579345703125,
      "learning_rate": 1.8936862249189515e-07,
      "loss": -0.2235,
      "num_tokens": 27022396.0,
      "reward": 0.036057278513908386,
      "reward_std": 0.03543171286582947,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001869198982603848,
      "rewards/logprob_reward/std": 0.00807044468820095,
      "step": 958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 726.6875,
      "completions/mean_terminated_length": 706.86669921875,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 2.9598765432098766,
      "grad_norm": 2.5928406908012582,
      "kl": 0.210205078125,
      "learning_rate": 1.8888373441951228e-07,
      "loss": -0.2476,
      "num_tokens": 27052470.0,
      "reward": 0.016658205538988113,
      "reward_std": 0.02684372290968895,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0011480040848255157,
      "rewards/logprob_reward/std": 0.0028190447483211756,
      "step": 959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 733.375,
      "completions/mean_terminated_length": 714.0000610351562,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 2.962962962962963,
      "grad_norm": 2.2541048802611576,
      "kl": 0.21533203125,
      "learning_rate": 1.8839909074872675e-07,
      "loss": -0.2485,
      "num_tokens": 27082594.0,
      "reward": 0.03550969064235687,
      "reward_std": 0.04626007378101349,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0012607639655470848,
      "rewards/logprob_reward/std": 0.0024260045029222965,
      "step": 960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 702.96875,
      "completions/mean_terminated_length": 681.5667114257812,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 2.9660493827160495,
      "grad_norm": 1.8861370870638634,
      "kl": 0.2171630859375,
      "learning_rate": 1.8791469341761e-07,
      "loss": -0.137,
      "num_tokens": 27111269.0,
      "reward": 0.04205770790576935,
      "reward_std": 0.041784629225730896,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015919003635644913,
      "rewards/logprob_reward/std": 0.0029006951954215765,
      "step": 961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 987.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 694.3125,
      "completions/mean_terminated_length": 694.3125,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.9691358024691357,
      "grad_norm": 2.4936754152100336,
      "kl": 0.2056884765625,
      "learning_rate": 1.8743054436324835e-07,
      "loss": -0.4614,
      "num_tokens": 27140059.0,
      "reward": 0.03833600506186485,
      "reward_std": 0.047480180859565735,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0009288942092098296,
      "rewards/logprob_reward/std": 0.0017160874558612704,
      "step": 962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 971.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 686.21875,
      "completions/mean_terminated_length": 686.21875,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.9722222222222223,
      "grad_norm": 2.1449705143275857,
      "kl": 0.21044921875,
      "learning_rate": 1.8694664552173529e-07,
      "loss": -0.0667,
      "num_tokens": 27168474.0,
      "reward": 0.0475720576941967,
      "reward_std": 0.03860652074217796,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0007745065959170461,
      "rewards/logprob_reward/std": 0.0016013880958780646,
      "step": 963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 712.625,
      "completions/mean_terminated_length": 680.413818359375,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 2.9753086419753085,
      "grad_norm": 3.0540231894763217,
      "kl": 0.2491455078125,
      "learning_rate": 1.8646299882816358e-07,
      "loss": -0.2756,
      "num_tokens": 27197718.0,
      "reward": 0.04453630745410919,
      "reward_std": 0.04714227467775345,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0008736720774322748,
      "rewards/logprob_reward/std": 0.0018073354149237275,
      "step": 964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 957.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 679.3125,
      "completions/mean_terminated_length": 679.3125,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 2.978395061728395,
      "grad_norm": 2.0565833101266557,
      "kl": 0.213623046875,
      "learning_rate": 1.859796062166178e-07,
      "loss": -0.081,
      "num_tokens": 27226264.0,
      "reward": 0.046038296073675156,
      "reward_std": 0.03503313288092613,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002542553935199976,
      "rewards/logprob_reward/std": 0.004069070797413588,
      "step": 965
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 752.90625,
      "completions/mean_terminated_length": 724.862060546875,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 2.9814814814814814,
      "grad_norm": 1.7593721687065136,
      "kl": NaN,
      "learning_rate": 1.854964696201666e-07,
      "loss": -0.0906,
      "num_tokens": 27257273.0,
      "reward": 0.025743938982486725,
      "reward_std": 0.03171555697917938,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0008265995420515537,
      "rewards/logprob_reward/std": 0.0023957984521985054,
      "step": 966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 921.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 655.25,
      "completions/mean_terminated_length": 655.25,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 2.984567901234568,
      "grad_norm": 1.8427884565955073,
      "kl": 0.2392578125,
      "learning_rate": 1.850135909708544e-07,
      "loss": -0.0342,
      "num_tokens": 27284561.0,
      "reward": 0.04149714112281799,
      "reward_std": 0.046559251844882965,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0009690446313470602,
      "rewards/logprob_reward/std": 0.002075807424262166,
      "step": 967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1017.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 639.34375,
      "completions/mean_terminated_length": 639.34375,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 2.9876543209876543,
      "grad_norm": 2.3890344483833665,
      "kl": 0.2347412109375,
      "learning_rate": 1.8453097219969448e-07,
      "loss": -0.1873,
      "num_tokens": 27311436.0,
      "reward": 0.042143747210502625,
      "reward_std": 0.04867733269929886,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0016874989960342646,
      "rewards/logprob_reward/std": 0.0028819881845265627,
      "step": 968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 701.59375,
      "completions/mean_terminated_length": 691.1935424804688,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 2.9907407407407405,
      "grad_norm": 2.2579359892038533,
      "kl": 0.2457275390625,
      "learning_rate": 1.8404861523666073e-07,
      "loss": -0.1065,
      "num_tokens": 27340667.0,
      "reward": 0.032514940947294235,
      "reward_std": 0.04773382470011711,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0014054876519367099,
      "rewards/logprob_reward/std": 0.0029887219425290823,
      "step": 969
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 641.875,
      "completions/mean_terminated_length": 629.54833984375,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 2.993827160493827,
      "grad_norm": 2.759877169084418,
      "kl": 0.2598876953125,
      "learning_rate": 1.8356652201068024e-07,
      "loss": -0.2884,
      "num_tokens": 27367331.0,
      "reward": 0.04795064032077789,
      "reward_std": 0.03241584450006485,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001195153803564608,
      "rewards/logprob_reward/std": 0.0020508323796093464,
      "step": 970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 690.9375,
      "completions/mean_terminated_length": 690.9375,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 2.996913580246914,
      "grad_norm": 2.0833704877638564,
      "kl": 0.216796875,
      "learning_rate": 1.830846944496251e-07,
      "loss": -0.1624,
      "num_tokens": 27395669.0,
      "reward": 0.04163329675793648,
      "reward_std": 0.03980160504579544,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0011203312315046787,
      "rewards/logprob_reward/std": 0.0018936976557597518,
      "step": 971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 757.625,
      "completions/mean_terminated_length": 708.2963256835938,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.0,
      "grad_norm": 1.676104430337265,
      "kl": 0.2392578125,
      "learning_rate": 1.826031344803053e-07,
      "loss": -0.0363,
      "num_tokens": 27426409.0,
      "reward": 0.03207068517804146,
      "reward_std": 0.04026733711361885,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0009118721354752779,
      "rewards/logprob_reward/std": 0.0016733306692913175,
      "step": 972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 727.25,
      "completions/mean_terminated_length": 684.857177734375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.003086419753086,
      "grad_norm": 2.085222805406508,
      "kl": 0.219482421875,
      "learning_rate": 1.8212184402846064e-07,
      "loss": -0.0639,
      "num_tokens": 27456689.0,
      "reward": 0.02974565513432026,
      "reward_std": 0.03855707868933678,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0018007296603173018,
      "rewards/logprob_reward/std": 0.00419599749147892,
      "step": 973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 725.6875,
      "completions/mean_terminated_length": 683.0714721679688,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 3.006172839506173,
      "grad_norm": 2.1373416146080815,
      "kl": 0.2381591796875,
      "learning_rate": 1.8164082501875326e-07,
      "loss": -0.1113,
      "num_tokens": 27486407.0,
      "reward": 0.01286717876791954,
      "reward_std": 0.025322534143924713,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.0004079767386429012,
      "rewards/logprob_reward/std": 0.0014256259892135859,
      "step": 974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 980.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 692.15625,
      "completions/mean_terminated_length": 692.15625,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.009259259259259,
      "grad_norm": 2.240096866392538,
      "kl": 0.224609375,
      "learning_rate": 1.8116007937475947e-07,
      "loss": -0.1386,
      "num_tokens": 27515024.0,
      "reward": 0.04009921848773956,
      "reward_std": 0.05619942396879196,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002888019662350416,
      "rewards/logprob_reward/std": 0.008251198567450047,
      "step": 975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 688.0625,
      "completions/mean_terminated_length": 677.2257690429688,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 3.0123456790123457,
      "grad_norm": 2.0895995813476627,
      "kl": 0.21240234375,
      "learning_rate": 1.8067960901896278e-07,
      "loss": -0.2006,
      "num_tokens": 27543946.0,
      "reward": 0.02907356433570385,
      "reward_std": 0.03428079932928085,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0010539607610553503,
      "rewards/logprob_reward/std": 0.002084154635667801,
      "step": 976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 711.6875,
      "completions/mean_terminated_length": 701.6128540039062,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 3.015432098765432,
      "grad_norm": 2.002426319069105,
      "kl": 0.2469482421875,
      "learning_rate": 1.8019941587274565e-07,
      "loss": -0.2033,
      "num_tokens": 27573084.0,
      "reward": 0.03858362138271332,
      "reward_std": 0.04214697331190109,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012040241854265332,
      "rewards/logprob_reward/std": 0.001954335253685713,
      "step": 977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 989.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 648.125,
      "completions/mean_terminated_length": 648.125,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 3.0185185185185186,
      "grad_norm": 2.352263155111765,
      "kl": 0.231201171875,
      "learning_rate": 1.7971950185638195e-07,
      "loss": -0.1319,
      "num_tokens": 27599792.0,
      "reward": 0.0429069846868515,
      "reward_std": 0.03892037644982338,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0025355410762131214,
      "rewards/logprob_reward/std": 0.0033677336759865284,
      "step": 978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 772.90625,
      "completions/mean_terminated_length": 714.9615478515625,
      "completions/min_length": 532.0,
      "completions/min_terminated_length": 532.0,
      "epoch": 3.021604938271605,
      "grad_norm": 2.8889595705531117,
      "kl": 0.206298828125,
      "learning_rate": 1.7923986888902948e-07,
      "loss": -0.1802,
      "num_tokens": 27630777.0,
      "reward": 0.03198745846748352,
      "reward_std": 0.03233847767114639,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.000819399137981236,
      "rewards/logprob_reward/std": 0.0019720583222806454,
      "step": 979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 716.96875,
      "completions/mean_terminated_length": 660.1111450195312,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 3.0246913580246915,
      "grad_norm": 2.0156040990052833,
      "kl": 0.241943359375,
      "learning_rate": 1.78760518888722e-07,
      "loss": -0.1204,
      "num_tokens": 27660368.0,
      "reward": 0.03233620151877403,
      "reward_std": 0.027743151411414146,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0012068887008354068,
      "rewards/logprob_reward/std": 0.0026189556811004877,
      "step": 980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 631.84375,
      "completions/mean_terminated_length": 619.1935424804688,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 3.0277777777777777,
      "grad_norm": 1.9160075974092692,
      "kl": 0.2626953125,
      "learning_rate": 1.782814537723617e-07,
      "loss": -0.0237,
      "num_tokens": 27686723.0,
      "reward": 0.05130518600344658,
      "reward_std": 0.03933805599808693,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.001450207200832665,
      "rewards/logprob_reward/std": 0.0026596777606755495,
      "step": 981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 738.125,
      "completions/mean_terminated_length": 708.5516967773438,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 3.0308641975308643,
      "grad_norm": 3.7638909857862006,
      "kl": 0.2254638671875,
      "learning_rate": 1.7780267545571175e-07,
      "loss": -0.2725,
      "num_tokens": 27716895.0,
      "reward": 0.04446043074131012,
      "reward_std": 0.04653332382440567,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0007893663714639843,
      "rewards/logprob_reward/std": 0.002341961720958352,
      "step": 982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 690.03125,
      "completions/mean_terminated_length": 679.258056640625,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 3.0339506172839505,
      "grad_norm": 2.510787819377592,
      "kl": 0.2392578125,
      "learning_rate": 1.7732418585338804e-07,
      "loss": -0.1958,
      "num_tokens": 27745608.0,
      "reward": 0.035718273371458054,
      "reward_std": 0.044114019721746445,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0014925259165465832,
      "rewards/logprob_reward/std": 0.0029040889348834753,
      "step": 983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 751.5625,
      "completions/mean_terminated_length": 723.3793334960938,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.037037037037037,
      "grad_norm": 2.3752642757269244,
      "kl": 0.2181396484375,
      "learning_rate": 1.7684598687885216e-07,
      "loss": -0.2773,
      "num_tokens": 27776058.0,
      "reward": 0.0384933166205883,
      "reward_std": 0.04638366401195526,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001103683258406818,
      "rewards/logprob_reward/std": 0.002340720733627677,
      "step": 984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 672.15625,
      "completions/mean_terminated_length": 660.8064575195312,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 3.0401234567901234,
      "grad_norm": 2.0222497224829206,
      "kl": 0.2366943359375,
      "learning_rate": 1.7636808044440344e-07,
      "loss": -0.1634,
      "num_tokens": 27804003.0,
      "reward": 0.051867835223674774,
      "reward_std": 0.04846703261137009,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0020753692369908094,
      "rewards/logprob_reward/std": 0.004790341481566429,
      "step": 985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 719.40625,
      "completions/mean_terminated_length": 699.1000366210938,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 3.04320987654321,
      "grad_norm": 2.5950314072937557,
      "kl": 0.223388671875,
      "learning_rate": 1.7589046846117132e-07,
      "loss": -0.2739,
      "num_tokens": 27833460.0,
      "reward": 0.045012228190898895,
      "reward_std": 0.047928906977176666,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0014024768024682999,
      "rewards/logprob_reward/std": 0.0025697017554193735,
      "step": 986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 822.0,
      "completions/mean_length": 684.78125,
      "completions/mean_terminated_length": 662.1666870117188,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 3.0462962962962963,
      "grad_norm": 2.3693857370894746,
      "kl": 0.2379150390625,
      "learning_rate": 1.754131528391078e-07,
      "loss": -0.2839,
      "num_tokens": 27861725.0,
      "reward": 0.04861406981945038,
      "reward_std": 0.049128152430057526,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0019322949228808284,
      "rewards/logprob_reward/std": 0.0037612488958984613,
      "step": 987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 663.03125,
      "completions/mean_terminated_length": 638.9666748046875,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 3.049382716049383,
      "grad_norm": 2.3964764203097415,
      "kl": 0.2425537109375,
      "learning_rate": 1.7493613548697966e-07,
      "loss": -0.1999,
      "num_tokens": 27889606.0,
      "reward": 0.044594209641218185,
      "reward_std": 0.04765457659959793,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0009380142437294126,
      "rewards/logprob_reward/std": 0.0017897032666951418,
      "step": 988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 706.125,
      "completions/mean_terminated_length": 673.2413940429688,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 3.052469135802469,
      "grad_norm": 3.0514295695731266,
      "kl": 0.2366943359375,
      "learning_rate": 1.744594183123611e-07,
      "loss": -0.2699,
      "num_tokens": 27919190.0,
      "reward": 0.0392814502120018,
      "reward_std": 0.043264396488666534,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0019793917890638113,
      "rewards/logprob_reward/std": 0.003085510805249214,
      "step": 989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 874.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 618.75,
      "completions/mean_terminated_length": 618.75,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 3.0555555555555554,
      "grad_norm": 1.9141571541766174,
      "kl": 0.227783203125,
      "learning_rate": 1.7398300322162563e-07,
      "loss": -0.1182,
      "num_tokens": 27945302.0,
      "reward": 0.03834158182144165,
      "reward_std": 0.04912228882312775,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0009350912878289819,
      "rewards/logprob_reward/std": 0.0020096460357308388,
      "step": 990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 685.78125,
      "completions/mean_terminated_length": 663.2333984375,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 3.058641975308642,
      "grad_norm": 2.431587339854599,
      "kl": 0.2183837890625,
      "learning_rate": 1.7350689211993902e-07,
      "loss": -0.296,
      "num_tokens": 27973827.0,
      "reward": 0.03231380879878998,
      "reward_std": 0.03982333093881607,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0011820093495771289,
      "rewards/logprob_reward/std": 0.0032097590155899525,
      "step": 991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 698.40625,
      "completions/mean_terminated_length": 676.7000122070312,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 3.0617283950617282,
      "grad_norm": 2.468386918464936,
      "kl": 0.22802734375,
      "learning_rate": 1.7303108691125107e-07,
      "loss": -0.1473,
      "num_tokens": 28002588.0,
      "reward": 0.03651568293571472,
      "reward_std": 0.03315194696187973,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0023785370867699385,
      "rewards/logprob_reward/std": 0.002841934096068144,
      "step": 992
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 818.15625,
      "completions/mean_terminated_length": 760.5199584960938,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 3.064814814814815,
      "grad_norm": 2.054287674639405,
      "kl": NaN,
      "learning_rate": 1.725555894982887e-07,
      "loss": -0.2509,
      "num_tokens": 28036217.0,
      "reward": 0.025921491906046867,
      "reward_std": 0.03331119567155838,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0010238795075565577,
      "rewards/logprob_reward/std": 0.0026635329704731703,
      "step": 993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 718.1875,
      "completions/mean_terminated_length": 708.3225708007812,
      "completions/min_length": 532.0,
      "completions/min_terminated_length": 532.0,
      "epoch": 3.067901234567901,
      "grad_norm": 1.999015239923265,
      "kl": 0.2410888671875,
      "learning_rate": 1.7208040178254768e-07,
      "loss": -0.0784,
      "num_tokens": 28065959.0,
      "reward": 0.03385654091835022,
      "reward_std": 0.034333594143390656,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0028961554635316133,
      "rewards/logprob_reward/std": 0.005538558587431908,
      "step": 994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 990.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 695.21875,
      "completions/mean_terminated_length": 695.21875,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 3.0709876543209877,
      "grad_norm": 3.0853137953398457,
      "kl": 0.2315673828125,
      "learning_rate": 1.716055256642855e-07,
      "loss": -0.3443,
      "num_tokens": 28095138.0,
      "reward": 0.0421161986887455,
      "reward_std": 0.047297537326812744,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0016568891005590558,
      "rewards/logprob_reward/std": 0.003244540421292186,
      "step": 995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 701.53125,
      "completions/mean_terminated_length": 691.1290283203125,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 3.074074074074074,
      "grad_norm": 2.3533149446198283,
      "kl": 0.20751953125,
      "learning_rate": 1.711309630425135e-07,
      "loss": -0.364,
      "num_tokens": 28123683.0,
      "reward": 0.05214660242199898,
      "reward_std": 0.041597574949264526,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0023851157166063786,
      "rewards/logprob_reward/std": 0.008369777351617813,
      "step": 996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 646.0625,
      "completions/mean_terminated_length": 620.86669921875,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 3.0771604938271606,
      "grad_norm": 2.593518711271626,
      "kl": 0.2825927734375,
      "learning_rate": 1.7065671581498936e-07,
      "loss": -0.2214,
      "num_tokens": 28150341.0,
      "reward": 0.02638131007552147,
      "reward_std": 0.032486990094184875,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0015347886364907026,
      "rewards/logprob_reward/std": 0.003711605677381158,
      "step": 997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 702.03125,
      "completions/mean_terminated_length": 668.72412109375,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 3.080246913580247,
      "grad_norm": 2.2021572076631157,
      "kl": 0.26025390625,
      "learning_rate": 1.701827858782095e-07,
      "loss": -0.1954,
      "num_tokens": 28178682.0,
      "reward": 0.029635073617100716,
      "reward_std": 0.02696829102933407,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0016778578283265233,
      "rewards/logprob_reward/std": 0.0037237918004393578,
      "step": 998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 738.875,
      "completions/mean_terminated_length": 729.6773681640625,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 3.0833333333333335,
      "grad_norm": 2.0928816058538193,
      "kl": 0.2159423828125,
      "learning_rate": 1.697091751274016e-07,
      "loss": -0.1492,
      "num_tokens": 28209202.0,
      "reward": 0.041847534477710724,
      "reward_std": 0.04098542034626007,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0013583700638264418,
      "rewards/logprob_reward/std": 0.0031691337935626507,
      "step": 999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 961.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 722.3125,
      "completions/mean_terminated_length": 722.3125,
      "completions/min_length": 527.0,
      "completions/min_terminated_length": 527.0,
      "epoch": 3.0864197530864197,
      "grad_norm": 1.712853241916546,
      "kl": 0.2030029296875,
      "learning_rate": 1.6923588545651672e-07,
      "loss": -0.0253,
      "num_tokens": 28239016.0,
      "reward": 0.04628859832882881,
      "reward_std": 0.03862706571817398,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002820664783939719,
      "rewards/logprob_reward/std": 0.0033508921042084694,
      "step": 1000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 738.65625,
      "completions/mean_terminated_length": 709.137939453125,
      "completions/min_length": 534.0,
      "completions/min_terminated_length": 534.0,
      "epoch": 3.0895061728395063,
      "grad_norm": 2.1370810475376314,
      "kl": 0.22802734375,
      "learning_rate": 1.687629187582221e-07,
      "loss": -0.1515,
      "num_tokens": 28269029.0,
      "reward": 0.012849628925323486,
      "reward_std": 0.025137685239315033,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.00038847571704536676,
      "rewards/logprob_reward/std": 0.001013743574731052,
      "step": 1001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 695.53125,
      "completions/mean_terminated_length": 684.9354858398438,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 3.0925925925925926,
      "grad_norm": 1.8238148773066314,
      "kl": 0.242431640625,
      "learning_rate": 1.6829027692389343e-07,
      "loss": -0.0121,
      "num_tokens": 28297110.0,
      "reward": 0.04439171403646469,
      "reward_std": 0.04510121047496796,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0007130148005671799,
      "rewards/logprob_reward/std": 0.001146567054092884,
      "step": 1002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 755.6875,
      "completions/mean_terminated_length": 680.5599975585938,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 3.095679012345679,
      "grad_norm": 1.855925683973445,
      "kl": 0.23095703125,
      "learning_rate": 1.678179618436073e-07,
      "loss": -0.0604,
      "num_tokens": 28327868.0,
      "reward": 0.02557981386780739,
      "reward_std": 0.0329207107424736,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0006442366866394877,
      "rewards/logprob_reward/std": 0.0016100271604955196,
      "step": 1003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 700.3125,
      "completions/mean_terminated_length": 678.7333984375,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 3.0987654320987654,
      "grad_norm": 1.8113067304562025,
      "kl": 0.2279052734375,
      "learning_rate": 1.6734597540613344e-07,
      "loss": -0.101,
      "num_tokens": 28356466.0,
      "reward": 0.044626496732234955,
      "reward_std": 0.04724197834730148,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0009738857625052333,
      "rewards/logprob_reward/std": 0.0015569920651614666,
      "step": 1004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 733.46875,
      "completions/mean_terminated_length": 703.413818359375,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 3.1018518518518516,
      "grad_norm": 3.41024478756569,
      "kl": 0.2373046875,
      "learning_rate": 1.6687431949892753e-07,
      "loss": -0.4791,
      "num_tokens": 28386333.0,
      "reward": 0.03247016295790672,
      "reward_std": 0.045533549040555954,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.001355737796984613,
      "rewards/logprob_reward/std": 0.0025667925365269184,
      "step": 1005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 732.5,
      "completions/mean_terminated_length": 702.3448486328125,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.1049382716049383,
      "grad_norm": 2.131559011515287,
      "kl": 0.22900390625,
      "learning_rate": 1.664029960081234e-07,
      "loss": -0.1937,
      "num_tokens": 28416165.0,
      "reward": 0.0485088974237442,
      "reward_std": 0.034199196845293045,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001815442810766399,
      "rewards/logprob_reward/std": 0.0028548906557261944,
      "step": 1006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 864.0,
      "completions/mean_length": 677.53125,
      "completions/mean_terminated_length": 641.6896362304688,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 3.1080246913580245,
      "grad_norm": 2.070636568687511,
      "kl": 0.23876953125,
      "learning_rate": 1.6593200681852574e-07,
      "loss": -0.1072,
      "num_tokens": 28444226.0,
      "reward": 0.035720594227313995,
      "reward_std": 0.04157908633351326,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0014951014891266823,
      "rewards/logprob_reward/std": 0.0026378934271633625,
      "step": 1007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 731.34375,
      "completions/mean_terminated_length": 701.0689697265625,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 3.111111111111111,
      "grad_norm": 1.9857547587384312,
      "kl": 0.232421875,
      "learning_rate": 1.6546135381360194e-07,
      "loss": -0.1387,
      "num_tokens": 28474009.0,
      "reward": 0.03834307938814163,
      "reward_std": 0.03995034843683243,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0009367556776851416,
      "rewards/logprob_reward/std": 0.0020925123244524,
      "step": 1008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 737.125,
      "completions/mean_terminated_length": 718.0000610351562,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.1141975308641974,
      "grad_norm": 4.697781797002306,
      "kl": 0.2415771484375,
      "learning_rate": 1.6499103887547544e-07,
      "loss": -0.2556,
      "num_tokens": 28504245.0,
      "reward": 0.0329495370388031,
      "reward_std": 0.040524132549762726,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0018883757293224335,
      "rewards/logprob_reward/std": 0.003306453814730048,
      "step": 1009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 749.78125,
      "completions/mean_terminated_length": 721.413818359375,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 3.117283950617284,
      "grad_norm": 1.8396823859155615,
      "kl": 0.202880859375,
      "learning_rate": 1.6452106388491762e-07,
      "loss": -0.3582,
      "num_tokens": 28535226.0,
      "reward": 0.03948305547237396,
      "reward_std": 0.04296427220106125,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002203392330557108,
      "rewards/logprob_reward/std": 0.004145064856857061,
      "step": 1010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 717.28125,
      "completions/mean_terminated_length": 696.8333740234375,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 3.1203703703703702,
      "grad_norm": 2.3819930186029965,
      "kl": 0.24072265625,
      "learning_rate": 1.6405143072134031e-07,
      "loss": -0.1811,
      "num_tokens": 28564647.0,
      "reward": 0.04897215962409973,
      "reward_std": 0.04186669737100601,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002330177929252386,
      "rewards/logprob_reward/std": 0.004359105136245489,
      "step": 1011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 728.59375,
      "completions/mean_terminated_length": 708.9000244140625,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 3.123456790123457,
      "grad_norm": 2.987956025095811,
      "kl": 0.238525390625,
      "learning_rate": 1.6358214126278855e-07,
      "loss": -0.4203,
      "num_tokens": 28594550.0,
      "reward": 0.03603680804371834,
      "reward_std": 0.027981655672192574,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0018464522436261177,
      "rewards/logprob_reward/std": 0.003124150214716792,
      "step": 1012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 808.0,
      "completions/mean_length": 685.65625,
      "completions/mean_terminated_length": 637.3214721679688,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 3.126543209876543,
      "grad_norm": 1.7103195077043507,
      "kl": 0.25830078125,
      "learning_rate": 1.6311319738593281e-07,
      "loss": -0.0873,
      "num_tokens": 28622875.0,
      "reward": 0.04240027815103531,
      "reward_std": 0.02873365208506584,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.00197252887301147,
      "rewards/logprob_reward/std": 0.0038132520858198404,
      "step": 1013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 701.125,
      "completions/mean_terminated_length": 690.7096557617188,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 3.1296296296296298,
      "grad_norm": 2.042134028680897,
      "kl": 0.235107421875,
      "learning_rate": 1.6264460096606169e-07,
      "loss": -0.275,
      "num_tokens": 28651531.0,
      "reward": 0.04480944573879242,
      "reward_std": 0.04709518328309059,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0011771632125601172,
      "rewards/logprob_reward/std": 0.0018997747683897614,
      "step": 1014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 848.0,
      "completions/mean_length": 683.0625,
      "completions/mean_terminated_length": 660.3333740234375,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.132716049382716,
      "grad_norm": 2.3774079561933714,
      "kl": 0.236572265625,
      "learning_rate": 1.621763538770743e-07,
      "loss": -0.195,
      "num_tokens": 28679949.0,
      "reward": 0.016525961458683014,
      "reward_std": 0.026975713670253754,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.001001067110337317,
      "rewards/logprob_reward/std": 0.0017371024005115032,
      "step": 1015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 773.75,
      "completions/mean_terminated_length": 738.0000610351562,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 3.1358024691358026,
      "grad_norm": 2.4172906761602793,
      "kl": 0.1939697265625,
      "learning_rate": 1.6170845799147266e-07,
      "loss": -0.2543,
      "num_tokens": 28711293.0,
      "reward": 0.02256914973258972,
      "reward_std": 0.039598651230335236,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0007712763035669923,
      "rewards/logprob_reward/std": 0.0014853577595204115,
      "step": 1016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 745.03125,
      "completions/mean_terminated_length": 680.6538696289062,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 3.138888888888889,
      "grad_norm": 2.2310177139275753,
      "kl": 0.211669921875,
      "learning_rate": 1.6124091518035443e-07,
      "loss": -0.2689,
      "num_tokens": 28741122.0,
      "reward": 0.0354393795132637,
      "reward_std": 0.051835887134075165,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0011826427653431892,
      "rewards/logprob_reward/std": 0.0019029693212360144,
      "step": 1017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 700.5625,
      "completions/mean_terminated_length": 679.0000610351562,
      "completions/min_length": 340.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 3.1419753086419755,
      "grad_norm": 3.6628987143442293,
      "kl": 0.253662109375,
      "learning_rate": 1.607737273134054e-07,
      "loss": -0.2316,
      "num_tokens": 28769980.0,
      "reward": 0.05108967423439026,
      "reward_std": 0.04814573749899864,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0012107489164918661,
      "rewards/logprob_reward/std": 0.0027121107559651136,
      "step": 1018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 754.25,
      "completions/mean_terminated_length": 726.3448486328125,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 3.1450617283950617,
      "grad_norm": 2.639872367878862,
      "kl": 0.22802734375,
      "learning_rate": 1.603068962588918e-07,
      "loss": -0.3494,
      "num_tokens": 28800628.0,
      "reward": 0.02560967206954956,
      "reward_std": 0.04000038653612137,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0006774109788239002,
      "rewards/logprob_reward/std": 0.0017976548988372087,
      "step": 1019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 698.03125,
      "completions/mean_terminated_length": 664.3103637695312,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 3.148148148148148,
      "grad_norm": 2.3527739878713354,
      "kl": 0.2325439453125,
      "learning_rate": 1.598404238836532e-07,
      "loss": -0.1972,
      "num_tokens": 28829805.0,
      "reward": 0.047809161245822906,
      "reward_std": 0.047749098390340805,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0010379606392234564,
      "rewards/logprob_reward/std": 0.0016769595677033067,
      "step": 1020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 760.4375,
      "completions/mean_terminated_length": 742.86669921875,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 3.1512345679012346,
      "grad_norm": 2.0403487129467446,
      "kl": 0.2138671875,
      "learning_rate": 1.5937431205309465e-07,
      "loss": -0.1521,
      "num_tokens": 28861047.0,
      "reward": 0.03904839605093002,
      "reward_std": 0.038639068603515625,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0017204422038048506,
      "rewards/logprob_reward/std": 0.0029273114632815123,
      "step": 1021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 760.4375,
      "completions/mean_terminated_length": 742.86669921875,
      "completions/min_length": 537.0,
      "completions/min_terminated_length": 537.0,
      "epoch": 3.154320987654321,
      "grad_norm": 1.7217313677726709,
      "kl": 0.245849609375,
      "learning_rate": 1.589085626311795e-07,
      "loss": -0.0634,
      "num_tokens": 28891781.0,
      "reward": 0.03565390780568123,
      "reward_std": 0.03870029002428055,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0014210063964128494,
      "rewards/logprob_reward/std": 0.0025313065852969885,
      "step": 1022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 731.03125,
      "completions/mean_terminated_length": 700.72412109375,
      "completions/min_length": 498.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 3.1574074074074074,
      "grad_norm": 2.2566933995265686,
      "kl": 0.2421875,
      "learning_rate": 1.5844317748042167e-07,
      "loss": -0.1424,
      "num_tokens": 28921954.0,
      "reward": 0.04573764652013779,
      "reward_std": 0.03755033761262894,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0022084980737417936,
      "rewards/logprob_reward/std": 0.003018285147845745,
      "step": 1023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 775.8125,
      "completions/mean_terminated_length": 718.5385131835938,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 3.1604938271604937,
      "grad_norm": 1.9696639810162537,
      "kl": 0.2086181640625,
      "learning_rate": 1.5797815846187868e-07,
      "loss": -0.2129,
      "num_tokens": 28953320.0,
      "reward": 0.0224867295473814,
      "reward_std": 0.03390219807624817,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0006796991219744086,
      "rewards/logprob_reward/std": 0.0016114730387926102,
      "step": 1024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 972.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 718.1875,
      "completions/mean_terminated_length": 718.1875,
      "completions/min_length": 538.0,
      "completions/min_terminated_length": 538.0,
      "epoch": 3.1635802469135803,
      "grad_norm": 2.1648531153637016,
      "kl": 0.2083740234375,
      "learning_rate": 1.575135074351435e-07,
      "loss": -0.1379,
      "num_tokens": 28982930.0,
      "reward": 0.034196820110082626,
      "reward_std": 0.03071579523384571,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0032742430921643972,
      "rewards/logprob_reward/std": 0.0059987688437104225,
      "step": 1025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 736.125,
      "completions/mean_terminated_length": 706.3448486328125,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 3.1666666666666665,
      "grad_norm": 2.182235340928639,
      "kl": 0.2423095703125,
      "learning_rate": 1.5704922625833784e-07,
      "loss": -0.1535,
      "num_tokens": 29013358.0,
      "reward": 0.022522062063217163,
      "reward_std": 0.027043446898460388,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0007189557072706521,
      "rewards/logprob_reward/std": 0.0018420717678964138,
      "step": 1026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 712.375,
      "completions/mean_terminated_length": 680.137939453125,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 3.169753086419753,
      "grad_norm": 2.301605215830569,
      "kl": 0.226806640625,
      "learning_rate": 1.565853167881042e-07,
      "loss": -0.2366,
      "num_tokens": 29042442.0,
      "reward": 0.041161030530929565,
      "reward_std": 0.040761448442935944,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0005955874803476036,
      "rewards/logprob_reward/std": 0.0013528020353987813,
      "step": 1027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 690.78125,
      "completions/mean_terminated_length": 668.5667114257812,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 3.1728395061728394,
      "grad_norm": 1.7288588696954923,
      "kl": 0.2275390625,
      "learning_rate": 1.5612178087959887e-07,
      "loss": -0.0607,
      "num_tokens": 29070931.0,
      "reward": 0.05454988032579422,
      "reward_std": 0.03911234438419342,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0015832000644877553,
      "rewards/logprob_reward/std": 0.0027248659171164036,
      "step": 1028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 675.4375,
      "completions/mean_terminated_length": 664.1935424804688,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 3.175925925925926,
      "grad_norm": 2.007448995328529,
      "kl": 0.25830078125,
      "learning_rate": 1.556586203864841e-07,
      "loss": -0.237,
      "num_tokens": 29098761.0,
      "reward": 0.03903896361589432,
      "reward_std": 0.04599328339099884,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0017099580727517605,
      "rewards/logprob_reward/std": 0.0027005146257579327,
      "step": 1029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 767.78125,
      "completions/mean_terminated_length": 731.1785888671875,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 3.1790123456790123,
      "grad_norm": 2.224659062172624,
      "kl": 0.2608642578125,
      "learning_rate": 1.5519583716092077e-07,
      "loss": -0.1733,
      "num_tokens": 29130854.0,
      "reward": 0.02665591612458229,
      "reward_std": 0.033299244940280914,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0018399073742330074,
      "rewards/logprob_reward/std": 0.003487040288746357,
      "step": 1030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 797.25,
      "completions/mean_terminated_length": 782.1333618164062,
      "completions/min_length": 526.0,
      "completions/min_terminated_length": 526.0,
      "epoch": 3.182098765432099,
      "grad_norm": 2.2077104602188977,
      "kl": 0.1951904296875,
      "learning_rate": 1.5473343305356136e-07,
      "loss": -0.1501,
      "num_tokens": 29163074.0,
      "reward": 0.03273075073957443,
      "reward_std": 0.04101334884762764,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0016452809795737267,
      "rewards/logprob_reward/std": 0.003429353702813387,
      "step": 1031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 682.65625,
      "completions/mean_terminated_length": 671.6451416015625,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 3.185185185185185,
      "grad_norm": 2.987474498192784,
      "kl": 0.22998046875,
      "learning_rate": 1.5427140991354215e-07,
      "loss": -0.3532,
      "num_tokens": 29191467.0,
      "reward": 0.016893979161977768,
      "reward_std": 0.022237852215766907,
      "rewards/format_reward_func/mean": 0.15625,
      "rewards/format_reward_func/std": 0.3689020276069641,
      "rewards/logprob_reward/mean": 0.0014099783729761839,
      "rewards/logprob_reward/std": 0.0024758463259786367,
      "step": 1032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 713.59375,
      "completions/mean_terminated_length": 681.4827270507812,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 3.1882716049382718,
      "grad_norm": 2.205118304246579,
      "kl": 0.22509765625,
      "learning_rate": 1.5380976958847572e-07,
      "loss": -0.2685,
      "num_tokens": 29220790.0,
      "reward": 0.038550931960344315,
      "reward_std": 0.04296690225601196,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0011676998110488057,
      "rewards/logprob_reward/std": 0.0033502120058983564,
      "step": 1033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 830.875,
      "completions/mean_terminated_length": 766.5,
      "completions/min_length": 556.0,
      "completions/min_terminated_length": 556.0,
      "epoch": 3.191358024691358,
      "grad_norm": 2.122485568618202,
      "kl": 0.2191162109375,
      "learning_rate": 1.5334851392444412e-07,
      "loss": -0.2083,
      "num_tokens": 29254654.0,
      "reward": 0.03013978898525238,
      "reward_std": 0.04280904680490494,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0022386545315384865,
      "rewards/logprob_reward/std": 0.003554455004632473,
      "step": 1034
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 736.375,
      "completions/mean_terminated_length": 717.2000122070312,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 3.1944444444444446,
      "grad_norm": 2.1343672273445065,
      "kl": NaN,
      "learning_rate": 1.5288764476599102e-07,
      "loss": -0.1414,
      "num_tokens": 29284462.0,
      "reward": 0.028765369206666946,
      "reward_std": 0.033886875957250595,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0007115213084034622,
      "rewards/logprob_reward/std": 0.0022839908488094807,
      "step": 1035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 697.90625,
      "completions/mean_terminated_length": 676.1666870117188,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 3.197530864197531,
      "grad_norm": 3.3306503208284592,
      "kl": 0.2279052734375,
      "learning_rate": 1.524271639561145e-07,
      "loss": -0.2411,
      "num_tokens": 29313527.0,
      "reward": 0.050074078142642975,
      "reward_std": 0.03632235527038574,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003554530907422304,
      "rewards/logprob_reward/std": 0.006470814347267151,
      "step": 1036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 842.0,
      "completions/mean_length": 710.0625,
      "completions/mean_terminated_length": 677.586181640625,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 3.200617283950617,
      "grad_norm": 2.07390040032122,
      "kl": 0.2005615234375,
      "learning_rate": 1.5196707333625959e-07,
      "loss": -0.1393,
      "num_tokens": 29342629.0,
      "reward": 0.03282039985060692,
      "reward_std": 0.04835759848356247,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0017448903527110815,
      "rewards/logprob_reward/std": 0.0045485785230994225,
      "step": 1037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 750.8125,
      "completions/mean_terminated_length": 742.0,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 3.2037037037037037,
      "grad_norm": 1.887487126054076,
      "kl": 0.2208251953125,
      "learning_rate": 1.5150737474631092e-07,
      "loss": -0.1085,
      "num_tokens": 29372939.0,
      "reward": 0.04539516195654869,
      "reward_std": 0.049080513417720795,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001827956992201507,
      "rewards/logprob_reward/std": 0.0025081937201321125,
      "step": 1038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 965.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 685.28125,
      "completions/mean_terminated_length": 685.28125,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 3.20679012345679,
      "grad_norm": 2.1564246168720596,
      "kl": 0.2523193359375,
      "learning_rate": 1.5104807002458564e-07,
      "loss": -0.0752,
      "num_tokens": 29400848.0,
      "reward": 0.04461712762713432,
      "reward_std": 0.05231209099292755,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0009634761372581124,
      "rewards/logprob_reward/std": 0.0015501893358305097,
      "step": 1039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 699.9375,
      "completions/mean_terminated_length": 689.4838256835938,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.2098765432098766,
      "grad_norm": 2.0727946114675793,
      "kl": 0.2550048828125,
      "learning_rate": 1.5058916100782555e-07,
      "loss": -0.146,
      "num_tokens": 29429434.0,
      "reward": 0.03488758206367493,
      "reward_std": 0.04631597176194191,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0005695360014215112,
      "rewards/logprob_reward/std": 0.0011281168553978205,
      "step": 1040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 989.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 651.09375,
      "completions/mean_terminated_length": 651.09375,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 3.212962962962963,
      "grad_norm": 1.7391712881031045,
      "kl": 0.2294921875,
      "learning_rate": 1.5013064953119036e-07,
      "loss": -0.1336,
      "num_tokens": 29456773.0,
      "reward": 0.04492807388305664,
      "reward_std": 0.040203481912612915,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0013089715503156185,
      "rewards/logprob_reward/std": 0.0030327115673571825,
      "step": 1041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 648.84375,
      "completions/mean_terminated_length": 636.741943359375,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.2160493827160495,
      "grad_norm": 2.5413915392621638,
      "kl": 0.23828125,
      "learning_rate": 1.4967253742824962e-07,
      "loss": -0.122,
      "num_tokens": 29483516.0,
      "reward": 0.05759654939174652,
      "reward_std": 0.047098174691200256,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0014961676206439734,
      "rewards/logprob_reward/std": 0.002962657017633319,
      "step": 1042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 712.5625,
      "completions/mean_terminated_length": 668.0714721679688,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 3.2191358024691357,
      "grad_norm": 1.930893478817174,
      "kl": 0.2281494140625,
      "learning_rate": 1.4921482653097614e-07,
      "loss": -0.1961,
      "num_tokens": 29512698.0,
      "reward": 0.029292644932866096,
      "reward_std": 0.027006058022379875,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0012973814737051725,
      "rewards/logprob_reward/std": 0.0025804650504142046,
      "step": 1043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 962.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 655.09375,
      "completions/mean_terminated_length": 655.09375,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 3.2222222222222223,
      "grad_norm": 2.4608711727614896,
      "kl": 0.23681640625,
      "learning_rate": 1.487575186697381e-07,
      "loss": -0.2578,
      "num_tokens": 29539953.0,
      "reward": 0.050926871597766876,
      "reward_std": 0.0479266420006752,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0010298553388565779,
      "rewards/logprob_reward/std": 0.0023105530999600887,
      "step": 1044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 729.4375,
      "completions/mean_terminated_length": 709.800048828125,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 3.2253086419753085,
      "grad_norm": 2.16844550061594,
      "kl": 0.2325439453125,
      "learning_rate": 1.4830061567329223e-07,
      "loss": -0.2563,
      "num_tokens": 29569231.0,
      "reward": 0.034893713891506195,
      "reward_std": 0.03588524088263512,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0005763471708633006,
      "rewards/logprob_reward/std": 0.0013616306241601706,
      "step": 1045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 753.0625,
      "completions/mean_terminated_length": 744.3225708007812,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 3.228395061728395,
      "grad_norm": 2.2636305900387246,
      "kl": 0.228759765625,
      "learning_rate": 1.4784411936877596e-07,
      "loss": -0.007,
      "num_tokens": 29600005.0,
      "reward": 0.036059677600860596,
      "reward_std": 0.04184982180595398,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0018718604696914554,
      "rewards/logprob_reward/std": 0.002798425266519189,
      "step": 1046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 811.0,
      "completions/mean_terminated_length": 771.5555419921875,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 3.2314814814814814,
      "grad_norm": 2.014576773630118,
      "kl": 0.21826171875,
      "learning_rate": 1.4738803158170043e-07,
      "loss": -0.0861,
      "num_tokens": 29632581.0,
      "reward": 0.031759947538375854,
      "reward_std": 0.04669643193483353,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.00751105509698391,
      "rewards/logprob_reward/std": 0.020503010600805283,
      "step": 1047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 748.21875,
      "completions/mean_terminated_length": 708.8214721679688,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 3.234567901234568,
      "grad_norm": 3.8583906524450953,
      "kl": 0.20849609375,
      "learning_rate": 1.469323541359433e-07,
      "loss": -0.4765,
      "num_tokens": 29662948.0,
      "reward": 0.038361284881830215,
      "reward_std": 0.047246262431144714,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0009569848771207035,
      "rewards/logprob_reward/std": 0.0018392475321888924,
      "step": 1048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 747.96875,
      "completions/mean_terminated_length": 719.413818359375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.2376543209876543,
      "grad_norm": 1.7135235071658506,
      "kl": 0.2265625,
      "learning_rate": 1.4647708885374105e-07,
      "loss": -0.0541,
      "num_tokens": 29693435.0,
      "reward": 0.019259363412857056,
      "reward_std": 0.03267936408519745,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0005659585003741086,
      "rewards/logprob_reward/std": 0.0018487462075427175,
      "step": 1049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 699.5625,
      "completions/mean_terminated_length": 666.0,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 3.240740740740741,
      "grad_norm": 2.0112628842864004,
      "kl": 0.225830078125,
      "learning_rate": 1.4602223755568212e-07,
      "loss": -0.0747,
      "num_tokens": 29721841.0,
      "reward": 0.030123358592391014,
      "reward_std": 0.03439383953809738,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0022203982807695866,
      "rewards/logprob_reward/std": 0.004442084114998579,
      "step": 1050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 751.3125,
      "completions/mean_terminated_length": 723.1034545898438,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 3.243827160493827,
      "grad_norm": 2.0339949808550184,
      "kl": 0.2362060546875,
      "learning_rate": 1.4556780206069925e-07,
      "loss": -0.1813,
      "num_tokens": 29752639.0,
      "reward": 0.036101654171943665,
      "reward_std": 0.0347515270113945,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0019185070414096117,
      "rewards/logprob_reward/std": 0.003167710965499282,
      "step": 1051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 671.9375,
      "completions/mean_terminated_length": 606.74072265625,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 3.246913580246914,
      "grad_norm": 1.8769141810596117,
      "kl": 0.25146484375,
      "learning_rate": 1.4511378418606272e-07,
      "loss": -0.013,
      "num_tokens": 29780501.0,
      "reward": 0.05136559158563614,
      "reward_std": 0.027284620329737663,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0015173237770795822,
      "rewards/logprob_reward/std": 0.0020197611302137375,
      "step": 1052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 692.25,
      "completions/mean_terminated_length": 670.1333618164062,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 3.25,
      "grad_norm": 2.0115822847084726,
      "kl": 0.2269287109375,
      "learning_rate": 1.4466018574737236e-07,
      "loss": -0.1848,
      "num_tokens": 29808813.0,
      "reward": 0.051165372133255005,
      "reward_std": 0.053093306720256805,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0012948578223586082,
      "rewards/logprob_reward/std": 0.0020043007098138332,
      "step": 1053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 736.6875,
      "completions/mean_terminated_length": 683.4815063476562,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 3.253086419753086,
      "grad_norm": 1.7710297363346417,
      "kl": 0.227294921875,
      "learning_rate": 1.4420700855855093e-07,
      "loss": -0.1326,
      "num_tokens": 29838807.0,
      "reward": 0.04775507003068924,
      "reward_std": 0.05113762617111206,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.000977854710072279,
      "rewards/logprob_reward/std": 0.0016442429041489959,
      "step": 1054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 706.40625,
      "completions/mean_terminated_length": 685.2333984375,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 3.256172839506173,
      "grad_norm": 3.2144726880549355,
      "kl": 0.22900390625,
      "learning_rate": 1.4375425443183675e-07,
      "loss": -0.3769,
      "num_tokens": 29867828.0,
      "reward": 0.039484698325395584,
      "reward_std": 0.05072614923119545,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0022052209824323654,
      "rewards/logprob_reward/std": 0.0035095112398266792,
      "step": 1055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 697.84375,
      "completions/mean_terminated_length": 687.3225708007812,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.259259259259259,
      "grad_norm": 2.187334941470878,
      "kl": 0.2265625,
      "learning_rate": 1.43301925177776e-07,
      "loss": -0.0828,
      "num_tokens": 29896563.0,
      "reward": 0.04186723381280899,
      "reward_std": 0.043973658233881,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001380261266604066,
      "rewards/logprob_reward/std": 0.001648860750719905,
      "step": 1056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 728.71875,
      "completions/mean_terminated_length": 698.1724243164062,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 3.2623456790123457,
      "grad_norm": 1.792696979967954,
      "kl": 0.2396240234375,
      "learning_rate": 1.4285002260521617e-07,
      "loss": -0.1215,
      "num_tokens": 29926834.0,
      "reward": 0.04456322640180588,
      "reward_std": 0.028252314776182175,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0009035864495672286,
      "rewards/logprob_reward/std": 0.0018524707993492484,
      "step": 1057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 702.6875,
      "completions/mean_terminated_length": 681.2667236328125,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 3.265432098765432,
      "grad_norm": 1.8841740076052507,
      "kl": 0.3082275390625,
      "learning_rate": 1.4239854852129807e-07,
      "loss": 0.0148,
      "num_tokens": 29955612.0,
      "reward": 0.057417526841163635,
      "reward_std": 0.039684195071458817,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0012972512049600482,
      "rewards/logprob_reward/std": 0.0016004204517230392,
      "step": 1058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 727.03125,
      "completions/mean_terminated_length": 672.0370483398438,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 3.2685185185185186,
      "grad_norm": 1.9088485363841312,
      "kl": 0.2410888671875,
      "learning_rate": 1.419475047314493e-07,
      "loss": -0.129,
      "num_tokens": 29985173.0,
      "reward": 0.03619737550616264,
      "reward_std": 0.04631843417882919,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0020248633809387684,
      "rewards/logprob_reward/std": 0.003684094874188304,
      "step": 1059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 761.15625,
      "completions/mean_terminated_length": 743.6333618164062,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 3.271604938271605,
      "grad_norm": 2.005587177258118,
      "kl": 0.21630859375,
      "learning_rate": 1.4149689303937662e-07,
      "loss": -0.0956,
      "num_tokens": 30016106.0,
      "reward": 0.041586898267269135,
      "reward_std": 0.04829026758670807,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0010687728645280004,
      "rewards/logprob_reward/std": 0.002006959868595004,
      "step": 1060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 722.34375,
      "completions/mean_terminated_length": 712.6128540039062,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 3.2746913580246915,
      "grad_norm": 1.94952303374902,
      "kl": 0.2384033203125,
      "learning_rate": 1.4104671524705892e-07,
      "loss": -0.1065,
      "num_tokens": 30045733.0,
      "reward": 0.060765333473682404,
      "reward_std": 0.05329817533493042,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015448182821273804,
      "rewards/logprob_reward/std": 0.0021833241917192936,
      "step": 1061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 700.15625,
      "completions/mean_terminated_length": 689.7096557617188,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 3.2777777777777777,
      "grad_norm": 1.9105227496199337,
      "kl": 0.2269287109375,
      "learning_rate": 1.4059697315473988e-07,
      "loss": -0.1209,
      "num_tokens": 30074278.0,
      "reward": 0.035946477204561234,
      "reward_std": 0.033595044165849686,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0017460857052356005,
      "rewards/logprob_reward/std": 0.00294461939483881,
      "step": 1062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 696.34375,
      "completions/mean_terminated_length": 685.774169921875,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 3.2808641975308643,
      "grad_norm": 3.7925798183141866,
      "kl": 0.2171630859375,
      "learning_rate": 1.4014766856092081e-07,
      "loss": -0.3987,
      "num_tokens": 30102937.0,
      "reward": 0.029342934489250183,
      "reward_std": 0.03597993403673172,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0013532602461054921,
      "rewards/logprob_reward/std": 0.0027918131090700626,
      "step": 1063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 772.53125,
      "completions/mean_terminated_length": 714.5,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 3.2839506172839505,
      "grad_norm": 2.6087865978155644,
      "kl": 0.2313232421875,
      "learning_rate": 1.3969880326235362e-07,
      "loss": -0.3567,
      "num_tokens": 30134278.0,
      "reward": 0.03363805264234543,
      "reward_std": 0.04130767285823822,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002653393428772688,
      "rewards/logprob_reward/std": 0.0037167624104768038,
      "step": 1064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 941.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 678.21875,
      "completions/mean_terminated_length": 678.21875,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 3.287037037037037,
      "grad_norm": 2.36546647548035,
      "kl": 0.2498779296875,
      "learning_rate": 1.3925037905403324e-07,
      "loss": -0.2612,
      "num_tokens": 30162613.0,
      "reward": 0.03613483905792236,
      "reward_std": 0.04617026448249817,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0019553746096789837,
      "rewards/logprob_reward/std": 0.004605346359312534,
      "step": 1065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 697.3125,
      "completions/mean_terminated_length": 686.774169921875,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 3.2901234567901234,
      "grad_norm": 2.2890774861387446,
      "kl": 0.229248046875,
      "learning_rate": 1.38802397729191e-07,
      "loss": -0.0201,
      "num_tokens": 30191727.0,
      "reward": 0.042044222354888916,
      "reward_std": 0.03896318003535271,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015769141027703881,
      "rewards/logprob_reward/std": 0.0022949024569243193,
      "step": 1066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 747.0625,
      "completions/mean_terminated_length": 728.6000366210938,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 3.29320987654321,
      "grad_norm": 3.3077592750464677,
      "kl": 0.2061767578125,
      "learning_rate": 1.3835486107928678e-07,
      "loss": -0.5235,
      "num_tokens": 30222065.0,
      "reward": 0.048837028443813324,
      "reward_std": 0.03456879034638405,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0021800282411277294,
      "rewards/logprob_reward/std": 0.0033928067423403263,
      "step": 1067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1005.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 687.9375,
      "completions/mean_terminated_length": 687.9375,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 3.2962962962962963,
      "grad_norm": 1.8257078762712673,
      "kl": 0.23486328125,
      "learning_rate": 1.3790777089400262e-07,
      "loss": -0.1296,
      "num_tokens": 30250055.0,
      "reward": 0.0545763298869133,
      "reward_std": 0.038820892572402954,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001612587017007172,
      "rewards/logprob_reward/std": 0.0026066070422530174,
      "step": 1068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 759.0,
      "completions/mean_terminated_length": 741.3333740234375,
      "completions/min_length": 570.0,
      "completions/min_terminated_length": 570.0,
      "epoch": 3.299382716049383,
      "grad_norm": 2.3746136697516618,
      "kl": 0.2366943359375,
      "learning_rate": 1.3746112896123494e-07,
      "loss": -0.2558,
      "num_tokens": 30281047.0,
      "reward": 0.05230496823787689,
      "reward_std": 0.049082618206739426,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002561077941209078,
      "rewards/logprob_reward/std": 0.0044354889541864395,
      "step": 1069
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 996.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 685.65625,
      "completions/mean_terminated_length": 685.65625,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 3.302469135802469,
      "grad_norm": 1.94869486964416,
      "kl": 0.244873046875,
      "learning_rate": 1.3701493706708768e-07,
      "loss": -0.1714,
      "num_tokens": 30309364.0,
      "reward": 0.04212292283773422,
      "reward_std": 0.03466665744781494,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0016643566777929664,
      "rewards/logprob_reward/std": 0.00276436610147357,
      "step": 1070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 717.09375,
      "completions/mean_terminated_length": 673.25,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.3055555555555554,
      "grad_norm": 2.319787808212525,
      "kl": 0.240966796875,
      "learning_rate": 1.3656919699586503e-07,
      "loss": -0.2518,
      "num_tokens": 30338719.0,
      "reward": 0.05116596817970276,
      "reward_std": 0.054804928600788116,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0012955210404470563,
      "rewards/logprob_reward/std": 0.0032878448255360126,
      "step": 1071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 953.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 717.875,
      "completions/mean_terminated_length": 717.875,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 3.308641975308642,
      "grad_norm": 1.8546660989109238,
      "kl": 0.243896484375,
      "learning_rate": 1.3612391053006446e-07,
      "loss": -0.0398,
      "num_tokens": 30368087.0,
      "reward": 0.02947724051773548,
      "reward_std": 0.03402159735560417,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0015024887397885323,
      "rewards/logprob_reward/std": 0.0030562926549464464,
      "step": 1072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1009.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 727.3125,
      "completions/mean_terminated_length": 727.3125,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 3.3117283950617282,
      "grad_norm": 1.8364315932873658,
      "kl": 0.226318359375,
      "learning_rate": 1.356790794503694e-07,
      "loss": -0.0978,
      "num_tokens": 30398241.0,
      "reward": 0.05554766207933426,
      "reward_std": 0.04061705991625786,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0026918482035398483,
      "rewards/logprob_reward/std": 0.0036417266819626093,
      "step": 1073
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 746.40625,
      "completions/mean_terminated_length": 706.7500610351562,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.314814814814815,
      "grad_norm": 1.750053847049476,
      "kl": NaN,
      "learning_rate": 1.3523470553564238e-07,
      "loss": -0.0674,
      "num_tokens": 30429078.0,
      "reward": 0.025996271520853043,
      "reward_std": 0.020551001653075218,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0011069679167121649,
      "rewards/logprob_reward/std": 0.002348089125007391,
      "step": 1074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 738.96875,
      "completions/mean_terminated_length": 729.774169921875,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 3.317901234567901,
      "grad_norm": 2.5657590551853753,
      "kl": 0.229736328125,
      "learning_rate": 1.3479079056291738e-07,
      "loss": -0.2523,
      "num_tokens": 30459509.0,
      "reward": 0.045897215604782104,
      "reward_std": 0.04792949557304382,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002385790692642331,
      "rewards/logprob_reward/std": 0.003764881519600749,
      "step": 1075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 672.5,
      "completions/mean_terminated_length": 661.1612548828125,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 3.3209876543209877,
      "grad_norm": 2.0646556550799566,
      "kl": 0.231201171875,
      "learning_rate": 1.3434733630739345e-07,
      "loss": -0.0329,
      "num_tokens": 30487429.0,
      "reward": 0.046139415353536606,
      "reward_std": 0.048572394996881485,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0026549044996500015,
      "rewards/logprob_reward/std": 0.005118787754327059,
      "step": 1076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 717.28125,
      "completions/mean_terminated_length": 696.8333740234375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.324074074074074,
      "grad_norm": 2.222405698943655,
      "kl": 0.257080078125,
      "learning_rate": 1.3390434454242704e-07,
      "loss": -0.1534,
      "num_tokens": 30517054.0,
      "reward": 0.051761068403720856,
      "reward_std": 0.03332982957363129,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.00195673992857337,
      "rewards/logprob_reward/std": 0.003259633667767048,
      "step": 1077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 751.625,
      "completions/mean_terminated_length": 723.4483032226562,
      "completions/min_length": 547.0,
      "completions/min_terminated_length": 547.0,
      "epoch": 3.3271604938271606,
      "grad_norm": 1.8774439793909277,
      "kl": 0.217041015625,
      "learning_rate": 1.334618170395254e-07,
      "loss": -0.1122,
      "num_tokens": 30547698.0,
      "reward": 0.03618477284908295,
      "reward_std": 0.04643512889742851,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.002010858617722988,
      "rewards/logprob_reward/std": 0.003403040813282132,
      "step": 1078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 683.46875,
      "completions/mean_terminated_length": 672.4838256835938,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 3.330246913580247,
      "grad_norm": 2.3012361545215336,
      "kl": 0.24853515625,
      "learning_rate": 1.3301975556833872e-07,
      "loss": -0.2881,
      "num_tokens": 30575953.0,
      "reward": 0.04187607765197754,
      "reward_std": 0.04695266857743263,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0013900839257985353,
      "rewards/logprob_reward/std": 0.0022216225042939186,
      "step": 1079
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 734.90625,
      "completions/mean_terminated_length": 715.6333618164062,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 3.3333333333333335,
      "grad_norm": 2.102076853054117,
      "kl": 0.223388671875,
      "learning_rate": 1.3257816189665398e-07,
      "loss": -0.2217,
      "num_tokens": 30605794.0,
      "reward": 0.03226393833756447,
      "reward_std": 0.04563473165035248,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0011265964712947607,
      "rewards/logprob_reward/std": 0.002209179336205125,
      "step": 1080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 707.59375,
      "completions/mean_terminated_length": 697.3870849609375,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 3.3364197530864197,
      "grad_norm": 1.9624133298170543,
      "kl": 0.241943359375,
      "learning_rate": 1.3213703779038726e-07,
      "loss": -0.1519,
      "num_tokens": 30635041.0,
      "reward": 0.04342280328273773,
      "reward_std": 0.04256758093833923,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0031086672097444534,
      "rewards/logprob_reward/std": 0.007012155372649431,
      "step": 1081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 760.5625,
      "completions/mean_terminated_length": 733.3103637695312,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 3.3395061728395063,
      "grad_norm": 1.867376771617639,
      "kl": 0.22021484375,
      "learning_rate": 1.3169638501357697e-07,
      "loss": -0.1062,
      "num_tokens": 30666043.0,
      "reward": 0.020213942974805832,
      "reward_std": 0.0332123339176178,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0016266022576019168,
      "rewards/logprob_reward/std": 0.0027723326347768307,
      "step": 1082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 731.28125,
      "completions/mean_terminated_length": 689.4642944335938,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 3.3425925925925926,
      "grad_norm": 2.2493488637163948,
      "kl": 0.2119140625,
      "learning_rate": 1.3125620532837667e-07,
      "loss": -0.2133,
      "num_tokens": 30695724.0,
      "reward": 0.03854992985725403,
      "reward_std": 0.047263868153095245,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0011665882775560021,
      "rewards/logprob_reward/std": 0.002320114290341735,
      "step": 1083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 711.5,
      "completions/mean_terminated_length": 690.6666870117188,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 3.3456790123456788,
      "grad_norm": 1.7566383739325062,
      "kl": 0.234375,
      "learning_rate": 1.3081650049504784e-07,
      "loss": -0.2941,
      "num_tokens": 30724772.0,
      "reward": 0.05443764105439186,
      "reward_std": 0.046541810035705566,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001458491780795157,
      "rewards/logprob_reward/std": 0.0019860758911818266,
      "step": 1084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 677.4375,
      "completions/mean_terminated_length": 666.258056640625,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 3.3487654320987654,
      "grad_norm": 2.144118656152997,
      "kl": 0.2432861328125,
      "learning_rate": 1.3037727227195333e-07,
      "loss": -0.0905,
      "num_tokens": 30752434.0,
      "reward": 0.05798827111721039,
      "reward_std": 0.04746585339307785,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0019314105156809092,
      "rewards/logprob_reward/std": 0.0033570851664990187,
      "step": 1085
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 710.09375,
      "completions/mean_terminated_length": 637.6538696289062,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 3.351851851851852,
      "grad_norm": 2.748219497473315,
      "kl": NaN,
      "learning_rate": 1.2993852241554986e-07,
      "loss": -0.259,
      "num_tokens": 30781593.0,
      "reward": 0.035829655826091766,
      "reward_std": 0.047223057597875595,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.00161628401838243,
      "rewards/logprob_reward/std": 0.0030803345143795013,
      "step": 1086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 709.71875,
      "completions/mean_terminated_length": 688.7667236328125,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 3.3549382716049383,
      "grad_norm": 2.1144851623470213,
      "kl": 0.2332763671875,
      "learning_rate": 1.295002526803813e-07,
      "loss": -0.1069,
      "num_tokens": 30810588.0,
      "reward": 0.03129696473479271,
      "reward_std": 0.03686773404479027,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0035244047176092863,
      "rewards/logprob_reward/std": 0.006848264019936323,
      "step": 1087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 715.375,
      "completions/mean_terminated_length": 694.800048828125,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.3580246913580245,
      "grad_norm": 2.9618576730531583,
      "kl": 0.242919921875,
      "learning_rate": 1.2906246481907145e-07,
      "loss": -0.4702,
      "num_tokens": 30839848.0,
      "reward": 0.03286594897508621,
      "reward_std": 0.04149939864873886,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0017954995855689049,
      "rewards/logprob_reward/std": 0.0030578349251300097,
      "step": 1088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 790.3125,
      "completions/mean_terminated_length": 724.8800048828125,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 3.361111111111111,
      "grad_norm": 4.720990537666491,
      "kl": 0.255859375,
      "learning_rate": 1.2862516058231718e-07,
      "loss": -0.4049,
      "num_tokens": 30871898.0,
      "reward": 0.03222402185201645,
      "reward_std": 0.0213757511228323,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.001082246657460928,
      "rewards/logprob_reward/std": 0.00286446837708354,
      "step": 1089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 785.15625,
      "completions/mean_terminated_length": 718.2799682617188,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 3.3641975308641974,
      "grad_norm": 3.713024794346732,
      "kl": 0.2628173828125,
      "learning_rate": 1.2818834171888136e-07,
      "loss": -0.2556,
      "num_tokens": 30903675.0,
      "reward": 0.022724945098161697,
      "reward_std": 0.02004055865108967,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0009443819872103631,
      "rewards/logprob_reward/std": 0.002085645915940404,
      "step": 1090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 691.09375,
      "completions/mean_terminated_length": 680.3547973632812,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 3.367283950617284,
      "grad_norm": 2.197632557539477,
      "kl": 0.2286376953125,
      "learning_rate": 1.277520099755857e-07,
      "loss": -0.1771,
      "num_tokens": 30932190.0,
      "reward": 0.03833567351102829,
      "reward_std": 0.04985808953642845,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.00440074922516942,
      "rewards/logprob_reward/std": 0.00975917000323534,
      "step": 1091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 756.5,
      "completions/mean_terminated_length": 728.8275756835938,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 3.3703703703703702,
      "grad_norm": 2.326730656353155,
      "kl": 0.2303466796875,
      "learning_rate": 1.2731616709730428e-07,
      "loss": -0.2801,
      "num_tokens": 30962786.0,
      "reward": 0.03941354900598526,
      "reward_std": 0.04291396588087082,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0021261684596538544,
      "rewards/logprob_reward/std": 0.0030039141420274973,
      "step": 1092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 700.125,
      "completions/mean_terminated_length": 653.857177734375,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 3.373456790123457,
      "grad_norm": 2.203955018923035,
      "kl": 0.241943359375,
      "learning_rate": 1.2688081482695577e-07,
      "loss": -0.1841,
      "num_tokens": 30991902.0,
      "reward": 0.03998091444373131,
      "reward_std": 0.04864718019962311,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0027565700002014637,
      "rewards/logprob_reward/std": 0.0038286775816231966,
      "step": 1093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 720.25,
      "completions/mean_terminated_length": 664.0,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 3.376543209876543,
      "grad_norm": 2.854076959202184,
      "kl": 0.2431640625,
      "learning_rate": 1.264459549054973e-07,
      "loss": -0.2387,
      "num_tokens": 31021362.0,
      "reward": 0.03293699026107788,
      "reward_std": 0.04116933047771454,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.001874431036412716,
      "rewards/logprob_reward/std": 0.0030146704521030188,
      "step": 1094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 712.3125,
      "completions/mean_terminated_length": 691.5333862304688,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 3.3796296296296298,
      "grad_norm": 2.180109317404519,
      "kl": 0.2454833984375,
      "learning_rate": 1.2601158907191696e-07,
      "loss": -0.2958,
      "num_tokens": 31050376.0,
      "reward": 0.048220813274383545,
      "reward_std": 0.053947970271110535,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001495343865826726,
      "rewards/logprob_reward/std": 0.0025552199222147465,
      "step": 1095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 980.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 685.65625,
      "completions/mean_terminated_length": 685.65625,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 3.382716049382716,
      "grad_norm": 1.9683328499369184,
      "kl": 0.2362060546875,
      "learning_rate": 1.2557771906322704e-07,
      "loss": -0.0929,
      "num_tokens": 31078785.0,
      "reward": 0.03360357880592346,
      "reward_std": 0.04289977252483368,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0026150825433433056,
      "rewards/logprob_reward/std": 0.005456762854009867,
      "step": 1096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 717.5625,
      "completions/mean_terminated_length": 707.6773681640625,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 3.3858024691358026,
      "grad_norm": 2.15758395020318,
      "kl": 0.2281494140625,
      "learning_rate": 1.2514434661445706e-07,
      "loss": -0.1629,
      "num_tokens": 31108331.0,
      "reward": 0.042262203991413116,
      "reward_std": 0.049088455736637115,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001819114200770855,
      "rewards/logprob_reward/std": 0.003913812804967165,
      "step": 1097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 774.46875,
      "completions/mean_terminated_length": 748.6551513671875,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 3.388888888888889,
      "grad_norm": 3.5569402046661818,
      "kl": 0.2369384765625,
      "learning_rate": 1.2471147345864672e-07,
      "loss": -0.5862,
      "num_tokens": 31139710.0,
      "reward": 0.041524291038513184,
      "reward_std": 0.05353492870926857,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0009992129635065794,
      "rewards/logprob_reward/std": 0.0018569540698081255,
      "step": 1098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 680.84375,
      "completions/mean_terminated_length": 669.774169921875,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 3.3919753086419755,
      "grad_norm": 1.8845426581783664,
      "kl": 0.235595703125,
      "learning_rate": 1.2427910132683928e-07,
      "loss": -0.0689,
      "num_tokens": 31167677.0,
      "reward": 0.05112278461456299,
      "reward_std": 0.04662879928946495,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0012475401163101196,
      "rewards/logprob_reward/std": 0.0020419848151504993,
      "step": 1099
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 789.28125,
      "completions/mean_terminated_length": 781.7096557617188,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 3.3950617283950617,
      "grad_norm": 1.964521616890569,
      "kl": 0.2049560546875,
      "learning_rate": 1.2384723194807408e-07,
      "loss": -0.2422,
      "num_tokens": 31199510.0,
      "reward": 0.05434644967317581,
      "reward_std": 0.04874827712774277,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0013571643503382802,
      "rewards/logprob_reward/std": 0.0026882714591920376,
      "step": 1100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 647.9375,
      "completions/mean_terminated_length": 635.8064575195312,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 3.398148148148148,
      "grad_norm": 2.907604830438677,
      "kl": 0.2667236328125,
      "learning_rate": 1.234158670493803e-07,
      "loss": -0.226,
      "num_tokens": 31226296.0,
      "reward": 0.022591644898056984,
      "reward_std": 0.038996174931526184,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0007962724776007235,
      "rewards/logprob_reward/std": 0.0012763147242367268,
      "step": 1101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 711.0625,
      "completions/mean_terminated_length": 666.357177734375,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 3.4012345679012346,
      "grad_norm": 2.917099739070145,
      "kl": 0.2615966796875,
      "learning_rate": 1.229850083557695e-07,
      "loss": -0.374,
      "num_tokens": 31255282.0,
      "reward": 0.04276447743177414,
      "reward_std": 0.05232639238238335,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002377194818109274,
      "rewards/logprob_reward/std": 0.0037482907064259052,
      "step": 1102
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 967.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 709.46875,
      "completions/mean_terminated_length": 709.46875,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 3.4043209876543212,
      "grad_norm": 2.104831814389895,
      "kl": NaN,
      "learning_rate": 1.2255465759022913e-07,
      "loss": -0.1829,
      "num_tokens": 31284433.0,
      "reward": 0.025428790599107742,
      "reward_std": 0.03470895066857338,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.00047643345897085965,
      "rewards/logprob_reward/std": 0.0010723703308030963,
      "step": 1103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1015.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 734.65625,
      "completions/mean_terminated_length": 734.65625,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 3.4074074074074074,
      "grad_norm": 1.9420649150287443,
      "kl": 0.2213134765625,
      "learning_rate": 1.2212481647371542e-07,
      "loss": -0.1529,
      "num_tokens": 31314102.0,
      "reward": 0.041694849729537964,
      "reward_std": 0.0320030152797699,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0011887189466506243,
      "rewards/logprob_reward/std": 0.0022691090125590563,
      "step": 1104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 729.34375,
      "completions/mean_terminated_length": 698.862060546875,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 3.4104938271604937,
      "grad_norm": 2.578077322286555,
      "kl": 0.22021484375,
      "learning_rate": 1.2169548672514625e-07,
      "loss": -0.2975,
      "num_tokens": 31343713.0,
      "reward": 0.035099804401397705,
      "reward_std": 0.04647810012102127,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0008053354104049504,
      "rewards/logprob_reward/std": 0.0015714397886767983,
      "step": 1105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 753.03125,
      "completions/mean_terminated_length": 744.290283203125,
      "completions/min_length": 552.0,
      "completions/min_terminated_length": 552.0,
      "epoch": 3.4135802469135803,
      "grad_norm": 2.1314975986171705,
      "kl": 0.1986083984375,
      "learning_rate": 1.2126667006139495e-07,
      "loss": -0.076,
      "num_tokens": 31374750.0,
      "reward": 0.052901607006788254,
      "reward_std": 0.05452405661344528,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0032240066211670637,
      "rewards/logprob_reward/std": 0.006283770315349102,
      "step": 1106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 722.0625,
      "completions/mean_terminated_length": 690.8275756835938,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 3.4166666666666665,
      "grad_norm": 1.9101388093195726,
      "kl": 0.214111328125,
      "learning_rate": 1.208383681972829e-07,
      "loss": -0.1831,
      "num_tokens": 31404168.0,
      "reward": 0.053703077137470245,
      "reward_std": 0.04286354407668114,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.004114528186619282,
      "rewards/logprob_reward/std": 0.004552820231765509,
      "step": 1107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 697.0,
      "completions/mean_terminated_length": 686.4515991210938,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 3.419753086419753,
      "grad_norm": 2.826444952568141,
      "kl": 0.2506103515625,
      "learning_rate": 1.2041058284557277e-07,
      "loss": -0.1742,
      "num_tokens": 31432660.0,
      "reward": 0.03298354148864746,
      "reward_std": 0.04296446591615677,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0019261582056060433,
      "rewards/logprob_reward/std": 0.0031893160194158554,
      "step": 1108
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 783.125,
      "completions/mean_terminated_length": 758.2069091796875,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 3.4228395061728394,
      "grad_norm": 2.1305928869403727,
      "kl": NaN,
      "learning_rate": 1.1998331571696162e-07,
      "loss": -0.0304,
      "num_tokens": 31465048.0,
      "reward": 0.03501129895448685,
      "reward_std": 0.0384102463722229,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.004179219249635935,
      "rewards/logprob_reward/std": 0.016072003170847893,
      "step": 1109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 676.03125,
      "completions/mean_terminated_length": 652.8333740234375,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 3.425925925925926,
      "grad_norm": 2.4360467739943394,
      "kl": 0.2364501953125,
      "learning_rate": 1.1955656852007438e-07,
      "loss": -0.1763,
      "num_tokens": 31493377.0,
      "reward": 0.04730891436338425,
      "reward_std": 0.04944960027933121,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00395435094833374,
      "rewards/logprob_reward/std": 0.011206968687474728,
      "step": 1110
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1013.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 694.46875,
      "completions/mean_terminated_length": 694.46875,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 3.4290123456790123,
      "grad_norm": 2.8233126079081936,
      "kl": NaN,
      "learning_rate": 1.1913034296145669e-07,
      "loss": -0.3677,
      "num_tokens": 31521904.0,
      "reward": 0.045856691896915436,
      "reward_std": 0.04630706459283829,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0023407696280628443,
      "rewards/logprob_reward/std": 0.0050558545626699924,
      "step": 1111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 667.53125,
      "completions/mean_terminated_length": 643.7667236328125,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 3.432098765432099,
      "grad_norm": 2.028474611018986,
      "kl": 0.2296142578125,
      "learning_rate": 1.1870464074556816e-07,
      "loss": -0.0693,
      "num_tokens": 31549281.0,
      "reward": 0.03846452385187149,
      "reward_std": 0.04738791286945343,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0010716933757066727,
      "rewards/logprob_reward/std": 0.001394196879118681,
      "step": 1112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 721.375,
      "completions/mean_terminated_length": 711.6128540039062,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 3.435185185185185,
      "grad_norm": 2.105570556456589,
      "kl": 0.242919921875,
      "learning_rate": 1.1827946357477559e-07,
      "loss": -0.1779,
      "num_tokens": 31579001.0,
      "reward": 0.03864138573408127,
      "reward_std": 0.04990891367197037,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.00126820825971663,
      "rewards/logprob_reward/std": 0.0032743855845183134,
      "step": 1113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 651.28125,
      "completions/mean_terminated_length": 639.258056640625,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 3.4382716049382718,
      "grad_norm": 2.030575097798009,
      "kl": 0.2626953125,
      "learning_rate": 1.1785481314934618e-07,
      "loss": -0.2432,
      "num_tokens": 31606130.0,
      "reward": 0.06310325860977173,
      "reward_std": 0.047171108424663544,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0006702886312268674,
      "rewards/logprob_reward/std": 0.0013811569660902023,
      "step": 1114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 735.125,
      "completions/mean_terminated_length": 725.8064575195312,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.441358024691358,
      "grad_norm": 2.0024161143975756,
      "kl": 0.226806640625,
      "learning_rate": 1.1743069116744064e-07,
      "loss": -0.0735,
      "num_tokens": 31636686.0,
      "reward": 0.036736417561769485,
      "reward_std": 0.04932186007499695,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0026237964630126953,
      "rewards/logprob_reward/std": 0.003783321939408779,
      "step": 1115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 720.9375,
      "completions/mean_terminated_length": 700.7333984375,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 3.4444444444444446,
      "grad_norm": 1.6023836770894058,
      "kl": 0.2440185546875,
      "learning_rate": 1.1700709932510656e-07,
      "loss": -0.0023,
      "num_tokens": 31666408.0,
      "reward": 0.026854412630200386,
      "reward_std": 0.03465839475393295,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0020604589954018593,
      "rewards/logprob_reward/std": 0.0032155991066247225,
      "step": 1116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 771.84375,
      "completions/mean_terminated_length": 745.7586059570312,
      "completions/min_length": 594.0,
      "completions/min_terminated_length": 594.0,
      "epoch": 3.447530864197531,
      "grad_norm": 2.294279853209709,
      "kl": 0.2257080078125,
      "learning_rate": 1.1658403931627125e-07,
      "loss": -0.2441,
      "num_tokens": 31697615.0,
      "reward": 0.032512031495571136,
      "reward_std": 0.03311271220445633,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0014022570103406906,
      "rewards/logprob_reward/std": 0.0018350256141275167,
      "step": 1117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 669.78125,
      "completions/mean_terminated_length": 658.3547973632812,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 3.450617283950617,
      "grad_norm": 1.8916531530781802,
      "kl": 0.2376708984375,
      "learning_rate": 1.1616151283273565e-07,
      "loss": -0.0906,
      "num_tokens": 31725732.0,
      "reward": 0.03587503358721733,
      "reward_std": 0.039832498878240585,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0016667036106809974,
      "rewards/logprob_reward/std": 0.002389201894402504,
      "step": 1118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1005.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 745.28125,
      "completions/mean_terminated_length": 745.28125,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 3.4537037037037037,
      "grad_norm": 2.1417357408168645,
      "kl": 0.2279052734375,
      "learning_rate": 1.1573952156416672e-07,
      "loss": -0.1894,
      "num_tokens": 31756221.0,
      "reward": 0.04853258654475212,
      "reward_std": 0.04154781997203827,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0018417645478621125,
      "rewards/logprob_reward/std": 0.0029962125699967146,
      "step": 1119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 724.34375,
      "completions/mean_terminated_length": 693.3448486328125,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 3.45679012345679,
      "grad_norm": 2.1271526220525567,
      "kl": 0.234130859375,
      "learning_rate": 1.1531806719809142e-07,
      "loss": -0.1871,
      "num_tokens": 31786008.0,
      "reward": 0.04200819879770279,
      "reward_std": 0.04148890823125839,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015368824824690819,
      "rewards/logprob_reward/std": 0.002423949772492051,
      "step": 1120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 713.0,
      "completions/mean_terminated_length": 692.2667236328125,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 3.4598765432098766,
      "grad_norm": 1.8761988556721207,
      "kl": 0.2294921875,
      "learning_rate": 1.1489715141988954e-07,
      "loss": -0.1463,
      "num_tokens": 31815312.0,
      "reward": 0.038898445665836334,
      "reward_std": 0.04714834690093994,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0015538274310529232,
      "rewards/logprob_reward/std": 0.0023342163767665625,
      "step": 1121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 730.15625,
      "completions/mean_terminated_length": 699.7586059570312,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 3.462962962962963,
      "grad_norm": 2.6330904755628635,
      "kl": 0.2239990234375,
      "learning_rate": 1.1447677591278715e-07,
      "loss": -0.1677,
      "num_tokens": 31844745.0,
      "reward": 0.035472720861434937,
      "reward_std": 0.04140324890613556,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0012196903117001057,
      "rewards/logprob_reward/std": 0.001673478283919394,
      "step": 1122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 988.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 678.4375,
      "completions/mean_terminated_length": 678.4375,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 3.4660493827160495,
      "grad_norm": 1.8465228167080314,
      "kl": 0.2327880859375,
      "learning_rate": 1.1405694235784972e-07,
      "loss": -0.075,
      "num_tokens": 31872487.0,
      "reward": 0.06046842038631439,
      "reward_std": 0.05312386155128479,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0012149163521826267,
      "rewards/logprob_reward/std": 0.0021992295514792204,
      "step": 1123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 763.71875,
      "completions/mean_terminated_length": 715.5184936523438,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 3.4691358024691357,
      "grad_norm": 2.304488976219399,
      "kl": 0.2335205078125,
      "learning_rate": 1.1363765243397555e-07,
      "loss": -0.1157,
      "num_tokens": 31903734.0,
      "reward": 0.02951209619641304,
      "reward_std": 0.03832588717341423,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0015412173233926296,
      "rewards/logprob_reward/std": 0.002513695042580366,
      "step": 1124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 708.5,
      "completions/mean_terminated_length": 698.3225708007812,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 3.4722222222222223,
      "grad_norm": 2.0540712434195814,
      "kl": 0.2178955078125,
      "learning_rate": 1.1321890781788884e-07,
      "loss": -0.1512,
      "num_tokens": 31933194.0,
      "reward": 0.05982912331819534,
      "reward_std": 0.05028437077999115,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003976802341639996,
      "rewards/logprob_reward/std": 0.004956958349794149,
      "step": 1125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 688.40625,
      "completions/mean_terminated_length": 666.0333862304688,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 3.4753086419753085,
      "grad_norm": 2.110734419144824,
      "kl": 0.2159423828125,
      "learning_rate": 1.1280071018413326e-07,
      "loss": -0.2126,
      "num_tokens": 31961363.0,
      "reward": 0.044660262763500214,
      "reward_std": 0.047581709921360016,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0010113996686413884,
      "rewards/logprob_reward/std": 0.0016988972201943398,
      "step": 1126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 992.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 681.59375,
      "completions/mean_terminated_length": 681.59375,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 3.478395061728395,
      "grad_norm": 1.889477444872169,
      "kl": 0.2265625,
      "learning_rate": 1.1238306120506505e-07,
      "loss": -0.1439,
      "num_tokens": 31989490.0,
      "reward": 0.0637340098619461,
      "reward_std": 0.04231216013431549,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0013711245264858007,
      "rewards/logprob_reward/std": 0.0019430826650932431,
      "step": 1127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 724.5625,
      "completions/mean_terminated_length": 714.9031982421875,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 3.4814814814814814,
      "grad_norm": 2.9944010607198024,
      "kl": 0.232666015625,
      "learning_rate": 1.1196596255084648e-07,
      "loss": -0.2672,
      "num_tokens": 32019300.0,
      "reward": 0.038902007043361664,
      "reward_std": 0.03860536217689514,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.00155778625048697,
      "rewards/logprob_reward/std": 0.0023658720310777426,
      "step": 1128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 710.40625,
      "completions/mean_terminated_length": 689.5000610351562,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 3.484567901234568,
      "grad_norm": 2.3202481339552774,
      "kl": 0.21337890625,
      "learning_rate": 1.11549415889439e-07,
      "loss": -0.1087,
      "num_tokens": 32048165.0,
      "reward": 0.035354673862457275,
      "reward_std": 0.04831884428858757,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0010885288938879967,
      "rewards/logprob_reward/std": 0.0017892775358632207,
      "step": 1129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 738.46875,
      "completions/mean_terminated_length": 719.433349609375,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 3.4876543209876543,
      "grad_norm": 1.9789032421868742,
      "kl": 0.2420654296875,
      "learning_rate": 1.1113342288659683e-07,
      "loss": -0.0749,
      "num_tokens": 32078748.0,
      "reward": 0.030769433826208115,
      "reward_std": 0.02733692154288292,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.002938260091468692,
      "rewards/logprob_reward/std": 0.004040616098791361,
      "step": 1130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 997.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 700.90625,
      "completions/mean_terminated_length": 700.90625,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.490740740740741,
      "grad_norm": 2.187451897590174,
      "kl": 0.2154541015625,
      "learning_rate": 1.1071798520585979e-07,
      "loss": -0.1702,
      "num_tokens": 32107581.0,
      "reward": 0.06450729072093964,
      "reward_std": 0.050118058919906616,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0022303247824311256,
      "rewards/logprob_reward/std": 0.0029016912449151278,
      "step": 1131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 770.84375,
      "completions/mean_terminated_length": 753.9667358398438,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 3.493827160493827,
      "grad_norm": 1.7180303555451324,
      "kl": 0.2032470703125,
      "learning_rate": 1.1030310450854729e-07,
      "loss": -0.2091,
      "num_tokens": 32138784.0,
      "reward": 0.029544800519943237,
      "reward_std": 0.03377370163798332,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0015775556676089764,
      "rewards/logprob_reward/std": 0.0020715794526040554,
      "step": 1132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 706.4375,
      "completions/mean_terminated_length": 673.586181640625,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 3.496913580246914,
      "grad_norm": 2.8185615779128224,
      "kl": 0.2470703125,
      "learning_rate": 1.0988878245375138e-07,
      "loss": -0.4412,
      "num_tokens": 32168162.0,
      "reward": 0.06334048509597778,
      "reward_std": 0.028165031224489212,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.004406091757118702,
      "rewards/logprob_reward/std": 0.004128764383494854,
      "step": 1133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 665.96875,
      "completions/mean_terminated_length": 654.4193115234375,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 3.5,
      "grad_norm": 2.2747570750255672,
      "kl": 0.2449951171875,
      "learning_rate": 1.094750206983299e-07,
      "loss": -0.0658,
      "num_tokens": 32195185.0,
      "reward": 0.05119810998439789,
      "reward_std": 0.04677068069577217,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0013312329538166523,
      "rewards/logprob_reward/std": 0.0021630190312862396,
      "step": 1134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 787.46875,
      "completions/mean_terminated_length": 743.6666870117188,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 3.503086419753086,
      "grad_norm": 1.8363172237592025,
      "kl": 0.202880859375,
      "learning_rate": 1.0906182089690025e-07,
      "loss": -0.0546,
      "num_tokens": 32227168.0,
      "reward": 0.04166097193956375,
      "reward_std": 0.02731870487332344,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0011510797776281834,
      "rewards/logprob_reward/std": 0.002137968083843589,
      "step": 1135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 840.0,
      "completions/max_terminated_length": 840.0,
      "completions/mean_length": 635.125,
      "completions/mean_terminated_length": 635.125,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 3.506172839506173,
      "grad_norm": 2.1241744368102093,
      "kl": 0.266845703125,
      "learning_rate": 1.0864918470183258e-07,
      "loss": -0.266,
      "num_tokens": 32253412.0,
      "reward": 0.05826055258512497,
      "reward_std": 0.055479831993579865,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002233944833278656,
      "rewards/logprob_reward/std": 0.004511518403887749,
      "step": 1136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 701.3125,
      "completions/mean_terminated_length": 679.800048828125,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 3.5092592592592595,
      "grad_norm": 2.6466497665544155,
      "kl": 0.233642578125,
      "learning_rate": 1.0823711376324313e-07,
      "loss": -0.1377,
      "num_tokens": 32282082.0,
      "reward": 0.05135006457567215,
      "reward_std": 0.04925466328859329,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0015000696294009686,
      "rewards/logprob_reward/std": 0.0025531058199703693,
      "step": 1137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 747.3125,
      "completions/mean_terminated_length": 738.3870849609375,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 3.5123456790123457,
      "grad_norm": 3.4678811944566195,
      "kl": 0.2003173828125,
      "learning_rate": 1.0782560972898783e-07,
      "loss": -0.4641,
      "num_tokens": 32312304.0,
      "reward": 0.04236939549446106,
      "reward_std": 0.05416145175695419,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0019382155733183026,
      "rewards/logprob_reward/std": 0.0023308454547077417,
      "step": 1138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 785.09375,
      "completions/mean_terminated_length": 760.3793334960938,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 3.515432098765432,
      "grad_norm": 2.3914373290431157,
      "kl": 0.18701171875,
      "learning_rate": 1.0741467424465544e-07,
      "loss": -0.1969,
      "num_tokens": 32343919.0,
      "reward": 0.025426620617508888,
      "reward_std": 0.04071640968322754,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.003946245182305574,
      "rewards/logprob_reward/std": 0.007611100561916828,
      "step": 1139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 736.34375,
      "completions/mean_terminated_length": 706.586181640625,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 3.5185185185185186,
      "grad_norm": 1.8549279697319916,
      "kl": 0.2083740234375,
      "learning_rate": 1.0700430895356119e-07,
      "loss": -0.0428,
      "num_tokens": 32374274.0,
      "reward": 0.05179382860660553,
      "reward_std": 0.03939950466156006,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0019931443966925144,
      "rewards/logprob_reward/std": 0.00250457925722003,
      "step": 1140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 682.5,
      "completions/mean_terminated_length": 659.7333374023438,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 3.521604938271605,
      "grad_norm": 2.1549472574588724,
      "kl": 0.2467041015625,
      "learning_rate": 1.0659451549674018e-07,
      "loss": -0.201,
      "num_tokens": 32402554.0,
      "reward": 0.04312340170145035,
      "reward_std": 0.04689634591341019,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002776003908365965,
      "rewards/logprob_reward/std": 0.004801588132977486,
      "step": 1141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 727.40625,
      "completions/mean_terminated_length": 696.72412109375,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 3.5246913580246915,
      "grad_norm": 2.0764939972234004,
      "kl": 0.214111328125,
      "learning_rate": 1.0618529551294053e-07,
      "loss": -0.0672,
      "num_tokens": 32432175.0,
      "reward": 0.03544144332408905,
      "reward_std": 0.04791321977972984,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0011849362635985017,
      "rewards/logprob_reward/std": 0.002227792516350746,
      "step": 1142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 707.90625,
      "completions/mean_terminated_length": 675.2069091796875,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 3.5277777777777777,
      "grad_norm": 2.093093067534423,
      "kl": 0.223388671875,
      "learning_rate": 1.0577665063861735e-07,
      "loss": -0.154,
      "num_tokens": 32461232.0,
      "reward": 0.05621867626905441,
      "reward_std": 0.05492142587900162,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.01732630468904972,
      "rewards/logprob_reward/std": 0.04858780652284622,
      "step": 1143
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 753.9375,
      "completions/mean_terminated_length": 715.357177734375,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 3.5308641975308643,
      "grad_norm": 1.7649380791762947,
      "kl": NaN,
      "learning_rate": 1.0536858250792582e-07,
      "loss": -0.1807,
      "num_tokens": 32492118.0,
      "reward": 0.03603683412075043,
      "reward_std": 0.03452879190444946,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001846479601226747,
      "rewards/logprob_reward/std": 0.0036165211349725723,
      "step": 1144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 663.96875,
      "completions/mean_terminated_length": 652.3547973632812,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 3.5339506172839505,
      "grad_norm": 1.8531141622542997,
      "kl": 0.2381591796875,
      "learning_rate": 1.0496109275271456e-07,
      "loss": -0.0774,
      "num_tokens": 32519561.0,
      "reward": 0.054988741874694824,
      "reward_std": 0.041676297783851624,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0020708239171653986,
      "rewards/logprob_reward/std": 0.0025657275691628456,
      "step": 1145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 759.625,
      "completions/mean_terminated_length": 742.0000610351562,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 3.537037037037037,
      "grad_norm": 2.59980971611076,
      "kl": 0.19775390625,
      "learning_rate": 1.0455418300251953e-07,
      "loss": -0.2394,
      "num_tokens": 32550661.0,
      "reward": 0.02973213419318199,
      "reward_std": 0.0364789180457592,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0017857032362371683,
      "rewards/logprob_reward/std": 0.0033551438245922327,
      "step": 1146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 765.15625,
      "completions/mean_terminated_length": 728.1785888671875,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 3.5401234567901234,
      "grad_norm": 3.37755020614535,
      "kl": 0.2130126953125,
      "learning_rate": 1.0414785488455718e-07,
      "loss": -0.4422,
      "num_tokens": 32581882.0,
      "reward": 0.039593394845724106,
      "reward_std": 0.048004984855651855,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002325995359569788,
      "rewards/logprob_reward/std": 0.002745207166299224,
      "step": 1147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 752.4375,
      "completions/mean_terminated_length": 724.3448486328125,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 3.5432098765432096,
      "grad_norm": 2.5484335148892345,
      "kl": 0.201171875,
      "learning_rate": 1.0374211002371808e-07,
      "loss": -0.2391,
      "num_tokens": 32612820.0,
      "reward": 0.03568592667579651,
      "reward_std": 0.01441339310258627,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0014565885066986084,
      "rewards/logprob_reward/std": 0.002635025419294834,
      "step": 1148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 700.59375,
      "completions/mean_terminated_length": 690.1612548828125,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 3.5462962962962963,
      "grad_norm": 2.227758388455807,
      "kl": 0.2174072265625,
      "learning_rate": 1.0333695004256035e-07,
      "loss": -0.1032,
      "num_tokens": 32641467.0,
      "reward": 0.03294813632965088,
      "reward_std": 0.04731585085391998,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.001886819489300251,
      "rewards/logprob_reward/std": 0.003531733760610223,
      "step": 1149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 713.84375,
      "completions/mean_terminated_length": 703.8386840820312,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 3.549382716049383,
      "grad_norm": 2.1803390550057085,
      "kl": 0.23193359375,
      "learning_rate": 1.0293237656130304e-07,
      "loss": -0.1421,
      "num_tokens": 32670570.0,
      "reward": 0.04645146429538727,
      "reward_std": 0.04729815572500229,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0030016263481229544,
      "rewards/logprob_reward/std": 0.005217393394559622,
      "step": 1150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 773.28125,
      "completions/mean_terminated_length": 715.423095703125,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.552469135802469,
      "grad_norm": 2.0905799438129216,
      "kl": 0.245849609375,
      "learning_rate": 1.0252839119782006e-07,
      "loss": -0.1921,
      "num_tokens": 32702179.0,
      "reward": 0.04366558417677879,
      "reward_std": 0.047508589923381805,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.003378424560651183,
      "rewards/logprob_reward/std": 0.005873650312423706,
      "step": 1151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 732.90625,
      "completions/mean_terminated_length": 702.7930908203125,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 3.5555555555555554,
      "grad_norm": 1.834207846084838,
      "kl": 0.21826171875,
      "learning_rate": 1.0212499556763335e-07,
      "loss": -0.0765,
      "num_tokens": 32731848.0,
      "reward": 0.03870991989970207,
      "reward_std": 0.047259245067834854,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001344357617199421,
      "rewards/logprob_reward/std": 0.002039544750005007,
      "step": 1152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 739.8125,
      "completions/mean_terminated_length": 687.1851806640625,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.558641975308642,
      "grad_norm": 1.70906949737979,
      "kl": 0.2301025390625,
      "learning_rate": 1.017221912839065e-07,
      "loss": -0.079,
      "num_tokens": 32762650.0,
      "reward": 0.02254810370504856,
      "reward_std": 0.028371116146445274,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0007478923653252423,
      "rewards/logprob_reward/std": 0.001637790584936738,
      "step": 1153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 725.84375,
      "completions/mean_terminated_length": 705.9666748046875,
      "completions/min_length": 548.0,
      "completions/min_terminated_length": 548.0,
      "epoch": 3.5617283950617287,
      "grad_norm": 1.8843546376830838,
      "kl": 0.2266845703125,
      "learning_rate": 1.0131997995743838e-07,
      "loss": -0.1384,
      "num_tokens": 32792361.0,
      "reward": 0.029994942247867584,
      "reward_std": 0.04116266965866089,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.002077713841572404,
      "rewards/logprob_reward/std": 0.002635175595059991,
      "step": 1154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 765.75,
      "completions/mean_terminated_length": 728.857177734375,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 3.564814814814815,
      "grad_norm": 1.9939235544558975,
      "kl": 0.2037353515625,
      "learning_rate": 1.0091836319665664e-07,
      "loss": -0.0247,
      "num_tokens": 32823197.0,
      "reward": 0.046854279935359955,
      "reward_std": 0.05307752639055252,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003449202748015523,
      "rewards/logprob_reward/std": 0.0035610662307590246,
      "step": 1155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 763.8125,
      "completions/mean_terminated_length": 703.769287109375,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 3.567901234567901,
      "grad_norm": 2.127539500228586,
      "kl": 0.226806640625,
      "learning_rate": 1.0051734260761135e-07,
      "loss": -0.0656,
      "num_tokens": 32853927.0,
      "reward": 0.03639114648103714,
      "reward_std": 0.04656834155321121,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.002240160945802927,
      "rewards/logprob_reward/std": 0.003256736323237419,
      "step": 1156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 711.0625,
      "completions/mean_terminated_length": 700.9677124023438,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 3.5709876543209877,
      "grad_norm": 2.2089161764476786,
      "kl": 0.219482421875,
      "learning_rate": 1.0011691979396827e-07,
      "loss": -0.0961,
      "num_tokens": 32883197.0,
      "reward": 0.045687802135944366,
      "reward_std": 0.02866285666823387,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0021531146485358477,
      "rewards/logprob_reward/std": 0.005218836013227701,
      "step": 1157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 733.25,
      "completions/mean_terminated_length": 713.86669921875,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 3.574074074074074,
      "grad_norm": 2.585901571693692,
      "kl": 0.221923828125,
      "learning_rate": 9.971709635700301e-08,
      "loss": -0.2335,
      "num_tokens": 32912889.0,
      "reward": 0.03194165229797363,
      "reward_std": 0.05235572159290314,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0007685050368309021,
      "rewards/logprob_reward/std": 0.0014013643376529217,
      "step": 1158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 778.5625,
      "completions/mean_terminated_length": 721.923095703125,
      "completions/min_length": 548.0,
      "completions/min_terminated_length": 548.0,
      "epoch": 3.5771604938271606,
      "grad_norm": 2.5027511395310427,
      "kl": 0.2294921875,
      "learning_rate": 9.931787389559393e-08,
      "loss": -0.2085,
      "num_tokens": 32944799.0,
      "reward": 0.03651416301727295,
      "reward_std": 0.03996749222278595,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0023768495302647352,
      "rewards/logprob_reward/std": 0.0031294492073357105,
      "step": 1159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 729.65625,
      "completions/mean_terminated_length": 710.0333862304688,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.580246913580247,
      "grad_norm": 1.916702283811012,
      "kl": 0.21142578125,
      "learning_rate": 9.891925400621642e-08,
      "loss": -0.0708,
      "num_tokens": 32974408.0,
      "reward": 0.045670777559280396,
      "reward_std": 0.04736469313502312,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0021341973915696144,
      "rewards/logprob_reward/std": 0.003208830486983061,
      "step": 1160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 675.90625,
      "completions/mean_terminated_length": 664.6774291992188,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 3.5833333333333335,
      "grad_norm": 2.309206694151157,
      "kl": 0.2421875,
      "learning_rate": 9.852123828293612e-08,
      "loss": -0.1358,
      "num_tokens": 33002549.0,
      "reward": 0.038855329155921936,
      "reward_std": 0.0476827509701252,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0015059204306453466,
      "rewards/logprob_reward/std": 0.0023971146438270807,
      "step": 1161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 714.90625,
      "completions/mean_terminated_length": 704.9354858398438,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 3.5864197530864197,
      "grad_norm": 2.4419855450292083,
      "kl": 0.24462890625,
      "learning_rate": 9.812382831740259e-08,
      "loss": -0.1761,
      "num_tokens": 33032410.0,
      "reward": 0.03587932139635086,
      "reward_std": 0.05150279775261879,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0016714700032025576,
      "rewards/logprob_reward/std": 0.0032537688966840506,
      "step": 1162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1004.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 688.65625,
      "completions/mean_terminated_length": 688.65625,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.5895061728395063,
      "grad_norm": 2.154061690190303,
      "kl": 0.220458984375,
      "learning_rate": 9.772702569884301e-08,
      "loss": -0.1281,
      "num_tokens": 33061115.0,
      "reward": 0.04232947528362274,
      "reward_std": 0.054103728383779526,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001893858890980482,
      "rewards/logprob_reward/std": 0.00251575093716383,
      "step": 1163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 744.75,
      "completions/mean_terminated_length": 715.862060546875,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 3.5925925925925926,
      "grad_norm": 3.837670779890254,
      "kl": 0.2205810546875,
      "learning_rate": 9.733083201405578e-08,
      "loss": -0.3855,
      "num_tokens": 33091775.0,
      "reward": 0.026944581419229507,
      "reward_std": 0.0369250513613224,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0021606460213661194,
      "rewards/logprob_reward/std": 0.004713218659162521,
      "step": 1164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 740.0,
      "completions/mean_terminated_length": 710.6206665039062,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 3.5956790123456788,
      "grad_norm": 1.7853595308513794,
      "kl": 0.2216796875,
      "learning_rate": 9.693524884740425e-08,
      "loss": -0.0343,
      "num_tokens": 33121851.0,
      "reward": 0.055697403848171234,
      "reward_std": 0.04853183776140213,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0028582289814949036,
      "rewards/logprob_reward/std": 0.0037844711914658546,
      "step": 1165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 745.6875,
      "completions/mean_terminated_length": 705.9285888671875,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 3.5987654320987654,
      "grad_norm": 1.9216443876331215,
      "kl": 0.2117919921875,
      "learning_rate": 9.654027778081042e-08,
      "loss": -0.2153,
      "num_tokens": 33152401.0,
      "reward": 0.04970502480864525,
      "reward_std": 0.04187548905611038,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003144473535940051,
      "rewards/logprob_reward/std": 0.0032250788062810898,
      "step": 1166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 967.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 687.15625,
      "completions/mean_terminated_length": 687.15625,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 3.601851851851852,
      "grad_norm": 2.6335268914583527,
      "kl": 0.237060546875,
      "learning_rate": 9.614592039374817e-08,
      "loss": -0.1578,
      "num_tokens": 33181054.0,
      "reward": 0.04822558909654617,
      "reward_std": 0.05351199582219124,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0015006531029939651,
      "rewards/logprob_reward/std": 0.0025360500440001488,
      "step": 1167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 737.21875,
      "completions/mean_terminated_length": 718.1000366210938,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 3.6049382716049383,
      "grad_norm": 2.108476036100173,
      "kl": 0.2164306640625,
      "learning_rate": 9.575217826323761e-08,
      "loss": -0.1678,
      "num_tokens": 33210873.0,
      "reward": 0.03886102885007858,
      "reward_std": 0.05453537404537201,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0015122548211365938,
      "rewards/logprob_reward/std": 0.002545837312936783,
      "step": 1168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 705.09375,
      "completions/mean_terminated_length": 683.8333740234375,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 3.6080246913580245,
      "grad_norm": 2.093434761242607,
      "kl": 0.220458984375,
      "learning_rate": 9.535905296383848e-08,
      "loss": -0.1648,
      "num_tokens": 33239724.0,
      "reward": 0.04284374788403511,
      "reward_std": 0.05345625802874565,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0024652741849422455,
      "rewards/logprob_reward/std": 0.004442441742867231,
      "step": 1169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 761.84375,
      "completions/mean_terminated_length": 744.36669921875,
      "completions/min_length": 544.0,
      "completions/min_terminated_length": 544.0,
      "epoch": 3.611111111111111,
      "grad_norm": 2.5958651505623527,
      "kl": 0.2073974609375,
      "learning_rate": 9.496654606764373e-08,
      "loss": -0.4292,
      "num_tokens": 33270991.0,
      "reward": 0.03993716090917587,
      "reward_std": 0.040980130434036255,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002707959618419409,
      "rewards/logprob_reward/std": 0.0048716869205236435,
      "step": 1170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 730.34375,
      "completions/mean_terminated_length": 699.9655151367188,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 3.6141975308641974,
      "grad_norm": 2.0841017528140022,
      "kl": 0.2457275390625,
      "learning_rate": 9.457465914427326e-08,
      "loss": -0.1026,
      "num_tokens": 33301178.0,
      "reward": 0.03388774394989014,
      "reward_std": 0.034075312316417694,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002930824179202318,
      "rewards/logprob_reward/std": 0.006740282755345106,
      "step": 1171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 737.0625,
      "completions/mean_terminated_length": 717.933349609375,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.617283950617284,
      "grad_norm": 2.087094990381655,
      "kl": 0.2344970703125,
      "learning_rate": 9.418339376086785e-08,
      "loss": -0.1951,
      "num_tokens": 33331128.0,
      "reward": 0.0337870828807354,
      "reward_std": 0.04293426126241684,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002818981185555458,
      "rewards/logprob_reward/std": 0.0037272456102073193,
      "step": 1172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 710.625,
      "completions/mean_terminated_length": 689.7333984375,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 3.6203703703703702,
      "grad_norm": 2.4310665200823744,
      "kl": 0.2216796875,
      "learning_rate": 9.379275148208276e-08,
      "loss": -0.1287,
      "num_tokens": 33360504.0,
      "reward": 0.033377714455127716,
      "reward_std": 0.03862452134490013,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002364126965403557,
      "rewards/logprob_reward/std": 0.0032782454509288073,
      "step": 1173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 731.0625,
      "completions/mean_terminated_length": 676.8148193359375,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 3.623456790123457,
      "grad_norm": 1.8869975444742286,
      "kl": 0.24365234375,
      "learning_rate": 9.340273387008152e-08,
      "loss": -0.0798,
      "num_tokens": 33390974.0,
      "reward": 0.02314150519669056,
      "reward_std": 0.028033804148435593,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0014072273625060916,
      "rewards/logprob_reward/std": 0.0022848050575703382,
      "step": 1174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 969.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 711.75,
      "completions/mean_terminated_length": 711.75,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 3.626543209876543,
      "grad_norm": 2.198024711927301,
      "kl": 0.2171630859375,
      "learning_rate": 9.30133424845294e-08,
      "loss": -0.1943,
      "num_tokens": 33420078.0,
      "reward": 0.04502446576952934,
      "reward_std": 0.05276203155517578,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0014160738792270422,
      "rewards/logprob_reward/std": 0.0021478289272636175,
      "step": 1175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 694.5,
      "completions/mean_terminated_length": 672.5333862304688,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 3.6296296296296298,
      "grad_norm": 2.229631533463387,
      "kl": 0.2266845703125,
      "learning_rate": 9.26245788825877e-08,
      "loss": -0.1133,
      "num_tokens": 33448962.0,
      "reward": 0.05338180810213089,
      "reward_std": 0.04487844929099083,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0037575638853013515,
      "rewards/logprob_reward/std": 0.005705379415303469,
      "step": 1176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 792.875,
      "completions/mean_terminated_length": 715.8333740234375,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 3.632716049382716,
      "grad_norm": 1.7076910558267198,
      "kl": 0.228515625,
      "learning_rate": 9.223644461890711e-08,
      "loss": 0.0117,
      "num_tokens": 33481466.0,
      "reward": 0.03370305895805359,
      "reward_std": 0.042882040143013,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.002725622383877635,
      "rewards/logprob_reward/std": 0.004249474499374628,
      "step": 1177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 870.0,
      "completions/mean_length": 720.59375,
      "completions/mean_terminated_length": 677.25,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 3.6358024691358026,
      "grad_norm": 1.7917361604252584,
      "kl": 0.2369384765625,
      "learning_rate": 9.184894124562162e-08,
      "loss": -0.0305,
      "num_tokens": 33511005.0,
      "reward": 0.03542282432317734,
      "reward_std": 0.0337524339556694,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0011642478639259934,
      "rewards/logprob_reward/std": 0.0017650318332016468,
      "step": 1178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 707.375,
      "completions/mean_terminated_length": 697.1612548828125,
      "completions/min_length": 333.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 3.638888888888889,
      "grad_norm": 2.7640861439708604,
      "kl": 0.2716064453125,
      "learning_rate": 9.146207031234232e-08,
      "loss": -0.148,
      "num_tokens": 33539557.0,
      "reward": 0.02736113965511322,
      "reward_std": 0.04650455713272095,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0026234870310872793,
      "rewards/logprob_reward/std": 0.0050385938957333565,
      "step": 1179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 678.0,
      "completions/mean_terminated_length": 666.8386840820312,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 3.6419753086419755,
      "grad_norm": 1.9242357904456104,
      "kl": 0.224853515625,
      "learning_rate": 9.107583336615124e-08,
      "loss": -0.1959,
      "num_tokens": 33567653.0,
      "reward": 0.04546947404742241,
      "reward_std": 0.04752422869205475,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0019105253741145134,
      "rewards/logprob_reward/std": 0.0036665520165115595,
      "step": 1180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 787.1875,
      "completions/mean_terminated_length": 771.4000244140625,
      "completions/min_length": 539.0,
      "completions/min_terminated_length": 539.0,
      "epoch": 3.6450617283950617,
      "grad_norm": 2.109700825889162,
      "kl": 0.201904296875,
      "learning_rate": 9.069023195159505e-08,
      "loss": -0.224,
      "num_tokens": 33599795.0,
      "reward": 0.047613054513931274,
      "reward_std": 0.045055896043777466,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.004292279947549105,
      "rewards/logprob_reward/std": 0.011248183436691761,
      "step": 1181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 742.09375,
      "completions/mean_terminated_length": 701.8214721679688,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 3.648148148148148,
      "grad_norm": 1.9162535355414676,
      "kl": 0.210205078125,
      "learning_rate": 9.030526761067911e-08,
      "loss": -0.2148,
      "num_tokens": 33630026.0,
      "reward": 0.039342716336250305,
      "reward_std": 0.0419294573366642,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002047464484348893,
      "rewards/logprob_reward/std": 0.0057477750815451145,
      "step": 1182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 781.875,
      "completions/mean_terminated_length": 737.0370483398438,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 3.6512345679012346,
      "grad_norm": 1.9041115803781818,
      "kl": 0.2135009765625,
      "learning_rate": 8.992094188286081e-08,
      "loss": -0.2172,
      "num_tokens": 33661490.0,
      "reward": 0.047771573066711426,
      "reward_std": 0.04141303151845932,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.000996189541183412,
      "rewards/logprob_reward/std": 0.0018626094097271562,
      "step": 1183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 746.3125,
      "completions/mean_terminated_length": 727.800048828125,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 3.6543209876543212,
      "grad_norm": 2.168200078517223,
      "kl": 0.216552734375,
      "learning_rate": 8.953725630504419e-08,
      "loss": -0.1496,
      "num_tokens": 33692164.0,
      "reward": 0.03015190362930298,
      "reward_std": 0.04134177416563034,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0022521147038787603,
      "rewards/logprob_reward/std": 0.003899810602888465,
      "step": 1184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 661.8125,
      "completions/mean_terminated_length": 650.1290283203125,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 3.6574074074074074,
      "grad_norm": 1.9923295968789287,
      "kl": 0.2342529296875,
      "learning_rate": 8.915421241157292e-08,
      "loss": -0.1572,
      "num_tokens": 33719854.0,
      "reward": 0.06792387366294861,
      "reward_std": 0.049035366624593735,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.006026518531143665,
      "rewards/logprob_reward/std": 0.014248420484364033,
      "step": 1185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 711.90625,
      "completions/mean_terminated_length": 691.1000366210938,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 3.6604938271604937,
      "grad_norm": 2.452255438851298,
      "kl": 0.236328125,
      "learning_rate": 8.877181173422487e-08,
      "loss": -0.2436,
      "num_tokens": 33749215.0,
      "reward": 0.03879091516137123,
      "reward_std": 0.048295870423316956,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001434349687770009,
      "rewards/logprob_reward/std": 0.002998590236529708,
      "step": 1186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 730.0,
      "completions/mean_terminated_length": 675.5555419921875,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 3.6635802469135803,
      "grad_norm": 1.8964564230924912,
      "kl": 0.2518310546875,
      "learning_rate": 8.839005580220574e-08,
      "loss": -0.156,
      "num_tokens": 33779163.0,
      "reward": 0.051121100783348083,
      "reward_std": 0.027093954384326935,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0012456680415198207,
      "rewards/logprob_reward/std": 0.0023642387241125107,
      "step": 1187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 707.6875,
      "completions/mean_terminated_length": 686.6000366210938,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 3.6666666666666665,
      "grad_norm": 2.698063869656446,
      "kl": 0.2183837890625,
      "learning_rate": 8.800894614214274e-08,
      "loss": -0.1528,
      "num_tokens": 33808297.0,
      "reward": 0.03385056555271149,
      "reward_std": 0.03981088474392891,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0028895149007439613,
      "rewards/logprob_reward/std": 0.00452409265562892,
      "step": 1188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 753.9375,
      "completions/mean_terminated_length": 715.357177734375,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 3.669753086419753,
      "grad_norm": 1.7943247066398242,
      "kl": 0.24609375,
      "learning_rate": 8.762848427807882e-08,
      "loss": -0.0357,
      "num_tokens": 33839319.0,
      "reward": 0.042798519134521484,
      "reward_std": 0.03934928774833679,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0024150179233402014,
      "rewards/logprob_reward/std": 0.005606563296169043,
      "step": 1189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 781.46875,
      "completions/mean_terminated_length": 765.300048828125,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 3.6728395061728394,
      "grad_norm": 1.8116751313581374,
      "kl": 0.2315673828125,
      "learning_rate": 8.724867173146633e-08,
      "loss": -0.0796,
      "num_tokens": 33871646.0,
      "reward": 0.027132708579301834,
      "reward_std": 0.03633095324039459,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.002369676483795047,
      "rewards/logprob_reward/std": 0.0037503838539123535,
      "step": 1190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 768.625,
      "completions/mean_terminated_length": 742.2069091796875,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.675925925925926,
      "grad_norm": 2.0130064927685023,
      "kl": 0.223876953125,
      "learning_rate": 8.686951002116111e-08,
      "loss": -0.2278,
      "num_tokens": 33902506.0,
      "reward": 0.045343153178691864,
      "reward_std": 0.0349864661693573,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017701697070151567,
      "rewards/logprob_reward/std": 0.0032237397972494364,
      "step": 1191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 754.375,
      "completions/mean_terminated_length": 715.857177734375,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.6790123456790123,
      "grad_norm": 2.337435117630004,
      "kl": 0.206298828125,
      "learning_rate": 8.649100066341614e-08,
      "loss": -0.282,
      "num_tokens": 33932814.0,
      "reward": 0.05505530908703804,
      "reward_std": 0.0429777055978775,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0056170071475207806,
      "rewards/logprob_reward/std": 0.011483278125524521,
      "step": 1192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 783.0,
      "completions/mean_terminated_length": 758.0689697265625,
      "completions/min_length": 558.0,
      "completions/min_terminated_length": 558.0,
      "epoch": 3.682098765432099,
      "grad_norm": 1.8540610444487833,
      "kl": 0.2149658203125,
      "learning_rate": 8.611314517187584e-08,
      "loss": -0.1743,
      "num_tokens": 33964362.0,
      "reward": 0.03965158388018608,
      "reward_std": 0.04772069305181503,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0023906445130705833,
      "rewards/logprob_reward/std": 0.002835685620084405,
      "step": 1193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 724.46875,
      "completions/mean_terminated_length": 714.8064575195312,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 3.685185185185185,
      "grad_norm": 2.0576603816592525,
      "kl": 0.223388671875,
      "learning_rate": 8.573594505756982e-08,
      "loss": -0.1572,
      "num_tokens": 33993769.0,
      "reward": 0.03854009136557579,
      "reward_std": 0.04036765918135643,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0011556560639292002,
      "rewards/logprob_reward/std": 0.002103740582242608,
      "step": 1194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 751.46875,
      "completions/mean_terminated_length": 742.6773681640625,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 3.6882716049382713,
      "grad_norm": 2.183602194642444,
      "kl": 0.21484375,
      "learning_rate": 8.535940182890685e-08,
      "loss": -0.2377,
      "num_tokens": 34024244.0,
      "reward": 0.04226265847682953,
      "reward_std": 0.0493067130446434,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0018196202581748366,
      "rewards/logprob_reward/std": 0.003216702723875642,
      "step": 1195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 705.4375,
      "completions/mean_terminated_length": 684.2000122070312,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 3.691358024691358,
      "grad_norm": 1.9032977973811886,
      "kl": 0.231201171875,
      "learning_rate": 8.498351699166889e-08,
      "loss": -0.1241,
      "num_tokens": 34053138.0,
      "reward": 0.042191024869680405,
      "reward_std": 0.043851274996995926,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001740025938488543,
      "rewards/logprob_reward/std": 0.0024247795809060335,
      "step": 1196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 995.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 742.96875,
      "completions/mean_terminated_length": 742.96875,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 3.6944444444444446,
      "grad_norm": 1.9042166827183096,
      "kl": 0.2301025390625,
      "learning_rate": 8.460829204900483e-08,
      "loss": -0.0344,
      "num_tokens": 34083621.0,
      "reward": 0.04693237692117691,
      "reward_std": 0.03480307385325432,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003535972908139229,
      "rewards/logprob_reward/std": 0.004171703942120075,
      "step": 1197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 807.34375,
      "completions/mean_terminated_length": 757.34619140625,
      "completions/min_length": 528.0,
      "completions/min_terminated_length": 528.0,
      "epoch": 3.697530864197531,
      "grad_norm": 1.9783222280287667,
      "kl": 0.213623046875,
      "learning_rate": 8.423372850142482e-08,
      "loss": -0.1585,
      "num_tokens": 34116080.0,
      "reward": 0.029560063034296036,
      "reward_std": 0.034590378403663635,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0015945147024467587,
      "rewards/logprob_reward/std": 0.00318322260864079,
      "step": 1198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 955.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 664.75,
      "completions/mean_terminated_length": 664.75,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 3.700617283950617,
      "grad_norm": 1.9085474594234189,
      "kl": 0.2432861328125,
      "learning_rate": 8.385982784679416e-08,
      "loss": -0.071,
      "num_tokens": 34143472.0,
      "reward": 0.04795948788523674,
      "reward_std": 0.04828812927007675,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0012049854267388582,
      "rewards/logprob_reward/std": 0.0014321227790787816,
      "step": 1199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 676.96875,
      "completions/mean_terminated_length": 653.8333740234375,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 3.7037037037037037,
      "grad_norm": 2.9150387049016016,
      "kl": 0.2626953125,
      "learning_rate": 8.348659158032723e-08,
      "loss": -0.2794,
      "num_tokens": 34171403.0,
      "reward": 0.03862811625003815,
      "reward_std": 0.03850669786334038,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012534614652395248,
      "rewards/logprob_reward/std": 0.0016249925829470158,
      "step": 1200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 700.84375,
      "completions/mean_terminated_length": 667.413818359375,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.7067901234567904,
      "grad_norm": 2.0905855134396663,
      "kl": 0.2216796875,
      "learning_rate": 8.311402119458138e-08,
      "loss": -0.056,
      "num_tokens": 34200246.0,
      "reward": 0.04225610941648483,
      "reward_std": 0.04913446307182312,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0018123441841453314,
      "rewards/logprob_reward/std": 0.003109812270849943,
      "step": 1201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 755.65625,
      "completions/mean_terminated_length": 737.7667236328125,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.7098765432098766,
      "grad_norm": 2.453056232109461,
      "kl": 0.2198486328125,
      "learning_rate": 8.274211817945135e-08,
      "loss": -0.0325,
      "num_tokens": 34230919.0,
      "reward": 0.03255268186330795,
      "reward_std": 0.04033456742763519,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0014474234776571393,
      "rewards/logprob_reward/std": 0.0026141770649701357,
      "step": 1202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1008.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 736.46875,
      "completions/mean_terminated_length": 736.46875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 3.712962962962963,
      "grad_norm": 2.251102088294818,
      "kl": 0.226806640625,
      "learning_rate": 8.237088402216297e-08,
      "loss": -0.1966,
      "num_tokens": 34261242.0,
      "reward": 0.024148931726813316,
      "reward_std": 0.03446749225258827,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0025265919975936413,
      "rewards/logprob_reward/std": 0.0057501113042235374,
      "step": 1203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 692.03125,
      "completions/mean_terminated_length": 681.3225708007812,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 3.7160493827160495,
      "grad_norm": 1.9580968722110697,
      "kl": 0.2088623046875,
      "learning_rate": 8.20003202072674e-08,
      "loss": -0.2147,
      "num_tokens": 34290075.0,
      "reward": 0.055211372673511505,
      "reward_std": 0.049151401966810226,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0023181959986686707,
      "rewards/logprob_reward/std": 0.0029256767593324184,
      "step": 1204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1009.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 669.65625,
      "completions/mean_terminated_length": 669.65625,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 3.7191358024691357,
      "grad_norm": 1.8875933590230973,
      "kl": 0.233154296875,
      "learning_rate": 8.163042821663507e-08,
      "loss": -0.2322,
      "num_tokens": 34317568.0,
      "reward": 0.04830477386713028,
      "reward_std": 0.03416569530963898,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001588636077940464,
      "rewards/logprob_reward/std": 0.002541485708206892,
      "step": 1205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 761.53125,
      "completions/mean_terminated_length": 744.0333862304688,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 3.7222222222222223,
      "grad_norm": 1.548060413554958,
      "kl": 0.20703125,
      "learning_rate": 8.126120952944987e-08,
      "loss": -0.0375,
      "num_tokens": 34348365.0,
      "reward": 0.049736231565475464,
      "reward_std": 0.04507913067936897,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003179146908223629,
      "rewards/logprob_reward/std": 0.006874183192849159,
      "step": 1206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 679.59375,
      "completions/mean_terminated_length": 668.4838256835938,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 3.7253086419753085,
      "grad_norm": 1.722266109401197,
      "kl": 0.231689453125,
      "learning_rate": 8.089266562220312e-08,
      "loss": -0.0794,
      "num_tokens": 34376276.0,
      "reward": 0.049007855355739594,
      "reward_std": 0.03966151177883148,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0023698355071246624,
      "rewards/logprob_reward/std": 0.003103325143456459,
      "step": 1207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 725.4375,
      "completions/mean_terminated_length": 715.8064575195312,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 3.728395061728395,
      "grad_norm": 2.1382150265884814,
      "kl": 0.2340087890625,
      "learning_rate": 8.052479796868784e-08,
      "loss": -0.1193,
      "num_tokens": 34406058.0,
      "reward": 0.046502694487571716,
      "reward_std": 0.041052673012018204,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0030585499480366707,
      "rewards/logprob_reward/std": 0.0041487994603812695,
      "step": 1208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 744.84375,
      "completions/mean_terminated_length": 726.2333984375,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 3.7314814814814814,
      "grad_norm": 2.07009566028124,
      "kl": 0.23486328125,
      "learning_rate": 8.015760803999244e-08,
      "loss": -0.0924,
      "num_tokens": 34436981.0,
      "reward": 0.03666501119732857,
      "reward_std": 0.0349053293466568,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0025444573257118464,
      "rewards/logprob_reward/std": 0.0035742786712944508,
      "step": 1209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 661.53125,
      "completions/mean_terminated_length": 649.8386840820312,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 3.734567901234568,
      "grad_norm": 2.7000418451746033,
      "kl": 0.236328125,
      "learning_rate": 7.979109730449552e-08,
      "loss": -0.2496,
      "num_tokens": 34464586.0,
      "reward": 0.04257820546627045,
      "reward_std": 0.046489086002111435,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0021702260710299015,
      "rewards/logprob_reward/std": 0.002300801919773221,
      "step": 1210
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 747.78125,
      "completions/mean_terminated_length": 696.629638671875,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 3.7376543209876543,
      "grad_norm": 1.8317555702504096,
      "kl": NaN,
      "learning_rate": 7.942526722785927e-08,
      "loss": -0.066,
      "num_tokens": 34495327.0,
      "reward": 0.04541856423020363,
      "reward_std": 0.04059213399887085,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0018539587035775185,
      "rewards/logprob_reward/std": 0.002871278440579772,
      "step": 1211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 743.21875,
      "completions/mean_terminated_length": 714.1724243164062,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 3.7407407407407405,
      "grad_norm": 2.0344929358154697,
      "kl": 0.2208251953125,
      "learning_rate": 7.906011927302417e-08,
      "loss": -0.0623,
      "num_tokens": 34525486.0,
      "reward": 0.04631774500012398,
      "reward_std": 0.03879896551370621,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002853048499673605,
      "rewards/logprob_reward/std": 0.0040617105551064014,
      "step": 1212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 693.21875,
      "completions/mean_terminated_length": 682.54833984375,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 3.743827160493827,
      "grad_norm": 2.400078149946603,
      "kl": 0.259765625,
      "learning_rate": 7.869565490020288e-08,
      "loss": -0.2001,
      "num_tokens": 34554457.0,
      "reward": 0.03221950680017471,
      "reward_std": 0.04514000564813614,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0010772309033200145,
      "rewards/logprob_reward/std": 0.0015260858926922083,
      "step": 1213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 748.8125,
      "completions/mean_terminated_length": 697.8518676757812,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 3.746913580246914,
      "grad_norm": 2.865671207904787,
      "kl": 0.248046875,
      "learning_rate": 7.833187556687443e-08,
      "loss": -0.1345,
      "num_tokens": 34585211.0,
      "reward": 0.03613856062293053,
      "reward_std": 0.033397313207387924,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001959512010216713,
      "rewards/logprob_reward/std": 0.004207263700664043,
      "step": 1214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 994.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 721.59375,
      "completions/mean_terminated_length": 721.59375,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 3.75,
      "grad_norm": 1.7862320520107289,
      "kl": 0.2359619140625,
      "learning_rate": 7.796878272777835e-08,
      "loss": -0.1372,
      "num_tokens": 34614774.0,
      "reward": 0.0359145887196064,
      "reward_std": 0.041065141558647156,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0017106522573158145,
      "rewards/logprob_reward/std": 0.002327613066881895,
      "step": 1215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 700.65625,
      "completions/mean_terminated_length": 690.2257690429688,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 3.753086419753086,
      "grad_norm": 1.6872003359318692,
      "kl": 0.2388916015625,
      "learning_rate": 7.760637783490906e-08,
      "loss": -0.0253,
      "num_tokens": 34643783.0,
      "reward": 0.0515599325299263,
      "reward_std": 0.04761520400643349,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0017332553397864103,
      "rewards/logprob_reward/std": 0.0022615049965679646,
      "step": 1216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 659.90625,
      "completions/mean_terminated_length": 635.6333618164062,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 3.756172839506173,
      "grad_norm": 1.8086629239116327,
      "kl": 0.2423095703125,
      "learning_rate": 7.724466233750961e-08,
      "loss": -0.1475,
      "num_tokens": 34670932.0,
      "reward": 0.05709145590662956,
      "reward_std": 0.04678111895918846,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.000934947922360152,
      "rewards/logprob_reward/std": 0.0011365532409399748,
      "step": 1217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 713.5,
      "completions/mean_terminated_length": 692.800048828125,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 3.7592592592592595,
      "grad_norm": 1.695154414779429,
      "kl": 0.228515625,
      "learning_rate": 7.688363768206651e-08,
      "loss": -0.0119,
      "num_tokens": 34700416.0,
      "reward": 0.03903389722108841,
      "reward_std": 0.03349316865205765,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0017043291591107845,
      "rewards/logprob_reward/std": 0.0036165001802146435,
      "step": 1218
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 785.09375,
      "completions/mean_terminated_length": 750.9642944335938,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 3.7623456790123457,
      "grad_norm": 1.7150936142135094,
      "kl": NaN,
      "learning_rate": 7.652330531230344e-08,
      "loss": -0.1117,
      "num_tokens": 34732815.0,
      "reward": 0.023483876138925552,
      "reward_std": 0.019030466675758362,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0017876423662528396,
      "rewards/logprob_reward/std": 0.0036532750818878412,
      "step": 1219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 966.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 723.71875,
      "completions/mean_terminated_length": 723.71875,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 3.765432098765432,
      "grad_norm": 1.827179034852,
      "kl": 0.2166748046875,
      "learning_rate": 7.616366666917571e-08,
      "loss": -0.0628,
      "num_tokens": 34762474.0,
      "reward": 0.043150365352630615,
      "reward_std": 0.04825136065483093,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0028059615287929773,
      "rewards/logprob_reward/std": 0.00501100393012166,
      "step": 1220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 957.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 680.1875,
      "completions/mean_terminated_length": 680.1875,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 3.7685185185185186,
      "grad_norm": 2.2984840229526817,
      "kl": 0.2330322265625,
      "learning_rate": 7.580472319086442e-08,
      "loss": -0.1623,
      "num_tokens": 34790476.0,
      "reward": 0.05447046086192131,
      "reward_std": 0.05344248563051224,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0014949534088373184,
      "rewards/logprob_reward/std": 0.001674546510912478,
      "step": 1221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 763.09375,
      "completions/mean_terminated_length": 725.8214721679688,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 3.771604938271605,
      "grad_norm": 1.9419510096840515,
      "kl": 0.348876953125,
      "learning_rate": 7.544647631277085e-08,
      "loss": -0.2638,
      "num_tokens": 34821555.0,
      "reward": 0.04223502427339554,
      "reward_std": 0.029497407376766205,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0017889136215671897,
      "rewards/logprob_reward/std": 0.002690923633053899,
      "step": 1222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 767.96875,
      "completions/mean_terminated_length": 682.625,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 3.7746913580246915,
      "grad_norm": 4.038824756495392,
      "kl": 0.26171875,
      "learning_rate": 7.508892746751034e-08,
      "loss": -0.4471,
      "num_tokens": 34853166.0,
      "reward": 0.020810972899198532,
      "reward_std": 0.030478069558739662,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0022899697069078684,
      "rewards/logprob_reward/std": 0.005298912525177002,
      "step": 1223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 724.40625,
      "completions/mean_terminated_length": 714.741943359375,
      "completions/min_length": 524.0,
      "completions/min_terminated_length": 524.0,
      "epoch": 3.7777777777777777,
      "grad_norm": 2.502944826698859,
      "kl": 0.2000732421875,
      "learning_rate": 7.473207808490701e-08,
      "loss": -0.4054,
      "num_tokens": 34883583.0,
      "reward": 0.036741383373737335,
      "reward_std": 0.04447250813245773,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.002629311755299568,
      "rewards/logprob_reward/std": 0.0031205445993691683,
      "step": 1224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 716.78125,
      "completions/mean_terminated_length": 706.8709716796875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "epoch": 3.7808641975308643,
      "grad_norm": 1.8493755286529348,
      "kl": 0.2239990234375,
      "learning_rate": 7.437592959198796e-08,
      "loss": -0.2095,
      "num_tokens": 34913224.0,
      "reward": 0.045536018908023834,
      "reward_std": 0.04138926416635513,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00198446586728096,
      "rewards/logprob_reward/std": 0.0033418447710573673,
      "step": 1225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 732.75,
      "completions/mean_terminated_length": 691.1428833007812,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 3.7839506172839505,
      "grad_norm": 1.9340080967606625,
      "kl": 0.2315673828125,
      "learning_rate": 7.402048341297718e-08,
      "loss": -0.004,
      "num_tokens": 34942832.0,
      "reward": 0.03293833136558533,
      "reward_std": 0.04251699149608612,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0018759226659312844,
      "rewards/logprob_reward/std": 0.0024454020895063877,
      "step": 1226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 673.4375,
      "completions/mean_terminated_length": 662.1290283203125,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 3.787037037037037,
      "grad_norm": 2.113541289678668,
      "kl": 0.2188720703125,
      "learning_rate": 7.36657409692903e-08,
      "loss": -0.0908,
      "num_tokens": 34970450.0,
      "reward": 0.03614543378353119,
      "reward_std": 0.0392109677195549,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0019671465270221233,
      "rewards/logprob_reward/std": 0.0022074554581195116,
      "step": 1227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 713.09375,
      "completions/mean_terminated_length": 680.9310302734375,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 3.7901234567901234,
      "grad_norm": 2.2451632956676395,
      "kl": 0.2337646484375,
      "learning_rate": 7.331170367952874e-08,
      "loss": -0.1257,
      "num_tokens": 34999701.0,
      "reward": 0.03641784191131592,
      "reward_std": 0.041536085307598114,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.002269826363772154,
      "rewards/logprob_reward/std": 0.0030190160032361746,
      "step": 1228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 759.25,
      "completions/mean_terminated_length": 710.2222290039062,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 3.7932098765432096,
      "grad_norm": 1.790719025447086,
      "kl": 0.2132568359375,
      "learning_rate": 7.295837295947404e-08,
      "loss": -0.0636,
      "num_tokens": 35030449.0,
      "reward": 0.04219186305999756,
      "reward_std": 0.041881777346134186,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0017409594729542732,
      "rewards/logprob_reward/std": 0.0025964633096009493,
      "step": 1229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 720.75,
      "completions/mean_terminated_length": 700.5333862304688,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.7962962962962963,
      "grad_norm": 2.109531348390788,
      "kl": 0.244873046875,
      "learning_rate": 7.260575022208218e-08,
      "loss": -0.2573,
      "num_tokens": 35059809.0,
      "reward": 0.0454866886138916,
      "reward_std": 0.04775111377239227,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0019296524114906788,
      "rewards/logprob_reward/std": 0.0025852415710687637,
      "step": 1230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 991.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 674.25,
      "completions/mean_terminated_length": 674.25,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 3.799382716049383,
      "grad_norm": 2.0918421209421596,
      "kl": 0.245361328125,
      "learning_rate": 7.225383687747789e-08,
      "loss": -0.0674,
      "num_tokens": 35087617.0,
      "reward": 0.06574690341949463,
      "reward_std": 0.047928690910339355,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0036076600663363934,
      "rewards/logprob_reward/std": 0.004685068968683481,
      "step": 1231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 723.0,
      "completions/mean_terminated_length": 691.862060546875,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.802469135802469,
      "grad_norm": 2.1232772554206014,
      "kl": 0.2265625,
      "learning_rate": 7.190263433294913e-08,
      "loss": -0.255,
      "num_tokens": 35117245.0,
      "reward": 0.04533160477876663,
      "reward_std": 0.0413593128323555,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017573356162756681,
      "rewards/logprob_reward/std": 0.0032630939967930317,
      "step": 1232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 743.125,
      "completions/mean_terminated_length": 714.0689697265625,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 3.8055555555555554,
      "grad_norm": 2.0827352450174166,
      "kl": 0.2052001953125,
      "learning_rate": 7.155214399294146e-08,
      "loss": -0.1668,
      "num_tokens": 35147805.0,
      "reward": 0.033200979232788086,
      "reward_std": 0.04059072211384773,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0021677513141185045,
      "rewards/logprob_reward/std": 0.0030180695466697216,
      "step": 1233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 794.0,
      "completions/mean_length": 652.6875,
      "completions/mean_terminated_length": 627.933349609375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.808641975308642,
      "grad_norm": 1.965401530823781,
      "kl": 0.2252197265625,
      "learning_rate": 7.120236725905215e-08,
      "loss": -0.2894,
      "num_tokens": 35174631.0,
      "reward": 0.06410109996795654,
      "reward_std": 0.04031594097614288,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0017790001584216952,
      "rewards/logprob_reward/std": 0.0020099529065191746,
      "step": 1234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 713.96875,
      "completions/mean_terminated_length": 669.6785888671875,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 3.8117283950617287,
      "grad_norm": 2.2498048608172185,
      "kl": 0.25146484375,
      "learning_rate": 7.085330553002494e-08,
      "loss": -0.3914,
      "num_tokens": 35203822.0,
      "reward": 0.04209384322166443,
      "reward_std": 0.0550503246486187,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0016320511931553483,
      "rewards/logprob_reward/std": 0.003687640419229865,
      "step": 1235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 746.28125,
      "completions/mean_terminated_length": 694.8518676757812,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 3.814814814814815,
      "grad_norm": 2.1615279386635815,
      "kl": 0.2513427734375,
      "learning_rate": 7.05049602017444e-08,
      "loss": -0.2149,
      "num_tokens": 35234335.0,
      "reward": 0.05331094563007355,
      "reward_std": 0.0348031111061573,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0036788287106901407,
      "rewards/logprob_reward/std": 0.005828774534165859,
      "step": 1236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 688.15625,
      "completions/mean_terminated_length": 665.7667236328125,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 3.817901234567901,
      "grad_norm": 2.809253845645748,
      "kl": 0.26416015625,
      "learning_rate": 7.015733266722993e-08,
      "loss": -0.2882,
      "num_tokens": 35263092.0,
      "reward": 0.03941737860441208,
      "reward_std": 0.03392641246318817,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.00213041715323925,
      "rewards/logprob_reward/std": 0.003829734865576029,
      "step": 1237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 695.28125,
      "completions/mean_terminated_length": 673.36669921875,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 3.8209876543209877,
      "grad_norm": 2.3128877488742745,
      "kl": 0.2628173828125,
      "learning_rate": 6.981042431663075e-08,
      "loss": -0.3074,
      "num_tokens": 35291793.0,
      "reward": 0.05240838974714279,
      "reward_std": 0.05356854945421219,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0026759887114167213,
      "rewards/logprob_reward/std": 0.005153441336005926,
      "step": 1238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 992.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 699.75,
      "completions/mean_terminated_length": 699.75,
      "completions/min_length": 533.0,
      "completions/min_terminated_length": 533.0,
      "epoch": 3.824074074074074,
      "grad_norm": 1.985619890073846,
      "kl": 0.2230224609375,
      "learning_rate": 6.946423653722006e-08,
      "loss": 0.0096,
      "num_tokens": 35320597.0,
      "reward": 0.046561695635318756,
      "reward_std": 0.055780161172151566,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0031241043470799923,
      "rewards/logprob_reward/std": 0.004135539289563894,
      "step": 1239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 667.46875,
      "completions/mean_terminated_length": 655.9677124023438,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.8271604938271606,
      "grad_norm": 1.925630907400574,
      "kl": 0.2232666015625,
      "learning_rate": 6.911877071338942e-08,
      "loss": -0.0786,
      "num_tokens": 35348544.0,
      "reward": 0.04542498663067818,
      "reward_std": 0.05450732633471489,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0018610956612974405,
      "rewards/logprob_reward/std": 0.0020276005379855633,
      "step": 1240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 723.125,
      "completions/mean_terminated_length": 703.0667114257812,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 3.830246913580247,
      "grad_norm": 2.203898023208827,
      "kl": 0.22216796875,
      "learning_rate": 6.877402822664352e-08,
      "loss": -0.2233,
      "num_tokens": 35377924.0,
      "reward": 0.048514775931835175,
      "reward_std": 0.05417148023843765,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0018219701014459133,
      "rewards/logprob_reward/std": 0.002444783691316843,
      "step": 1241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 782.09375,
      "completions/mean_terminated_length": 726.269287109375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.8333333333333335,
      "grad_norm": 2.114242025985549,
      "kl": 0.203369140625,
      "learning_rate": 6.843001045559416e-08,
      "loss": -0.1085,
      "num_tokens": 35409619.0,
      "reward": 0.045830316841602325,
      "reward_std": 0.040851891040802,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0023114606738090515,
      "rewards/logprob_reward/std": 0.0027213245630264282,
      "step": 1242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 801.90625,
      "completions/mean_terminated_length": 760.7777709960938,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 3.8364197530864197,
      "grad_norm": 3.3803453345376386,
      "kl": 0.2239990234375,
      "learning_rate": 6.808671877595524e-08,
      "loss": -0.2605,
      "num_tokens": 35442532.0,
      "reward": 0.013396810740232468,
      "reward_std": 0.02067142352461815,
      "rewards/format_reward_func/mean": 0.125,
      "rewards/format_reward_func/std": 0.33601075410842896,
      "rewards/logprob_reward/mean": 0.000996455317363143,
      "rewards/logprob_reward/std": 0.0018346697324886918,
      "step": 1243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 927.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 703.75,
      "completions/mean_terminated_length": 703.75,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.8395061728395063,
      "grad_norm": 2.080147097218789,
      "kl": 0.2276611328125,
      "learning_rate": 6.774415456053697e-08,
      "loss": -0.2862,
      "num_tokens": 35471864.0,
      "reward": 0.06176084280014038,
      "reward_std": 0.054994165897369385,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0026509352028369904,
      "rewards/logprob_reward/std": 0.004046175163239241,
      "step": 1244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 707.03125,
      "completions/mean_terminated_length": 696.8064575195312,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.8425925925925926,
      "grad_norm": 2.0884837740930595,
      "kl": 0.24169921875,
      "learning_rate": 6.740231917924053e-08,
      "loss": -0.0457,
      "num_tokens": 35500845.0,
      "reward": 0.04472806304693222,
      "reward_std": 0.047552864998579025,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0010867358651012182,
      "rewards/logprob_reward/std": 0.0016396671999245882,
      "step": 1245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 733.6875,
      "completions/mean_terminated_length": 703.6551513671875,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 3.8456790123456788,
      "grad_norm": 2.299257104666202,
      "kl": 0.2156982421875,
      "learning_rate": 6.706121399905245e-08,
      "loss": -0.1831,
      "num_tokens": 35531067.0,
      "reward": 0.045227088034152985,
      "reward_std": 0.053179167211055756,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001641207723878324,
      "rewards/logprob_reward/std": 0.002661538077518344,
      "step": 1246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 707.5625,
      "completions/mean_terminated_length": 662.357177734375,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 3.8487654320987654,
      "grad_norm": 2.7574531391395247,
      "kl": 0.249755859375,
      "learning_rate": 6.672084038403927e-08,
      "loss": -0.261,
      "num_tokens": 35560561.0,
      "reward": 0.04215136915445328,
      "reward_std": 0.048834048211574554,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0016959626227617264,
      "rewards/logprob_reward/std": 0.0026369316037744284,
      "step": 1247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 702.78125,
      "completions/mean_terminated_length": 669.5516967773438,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 3.851851851851852,
      "grad_norm": 2.3353384847628798,
      "kl": 0.2225341796875,
      "learning_rate": 6.638119969534201e-08,
      "loss": -0.2779,
      "num_tokens": 35589510.0,
      "reward": 0.04260419309139252,
      "reward_std": 0.05629081279039383,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0021991045214235783,
      "rewards/logprob_reward/std": 0.002575478982180357,
      "step": 1248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 730.53125,
      "completions/mean_terminated_length": 700.1724243164062,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 3.8549382716049383,
      "grad_norm": 1.8179618245074758,
      "kl": 0.218505859375,
      "learning_rate": 6.604229329117064e-08,
      "loss": -0.114,
      "num_tokens": 35619435.0,
      "reward": 0.048428699374198914,
      "reward_std": 0.04567597061395645,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001726333750411868,
      "rewards/logprob_reward/std": 0.0032993017230182886,
      "step": 1249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 673.625,
      "completions/mean_terminated_length": 637.3793334960938,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.8580246913580245,
      "grad_norm": 2.1642770276808836,
      "kl": 0.25,
      "learning_rate": 6.570412252679894e-08,
      "loss": -0.1348,
      "num_tokens": 35647623.0,
      "reward": 0.04711909219622612,
      "reward_std": 0.040639057755470276,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003743434324860573,
      "rewards/logprob_reward/std": 0.00589523883536458,
      "step": 1250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 661.28125,
      "completions/mean_terminated_length": 649.5806274414062,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 3.861111111111111,
      "grad_norm": 1.9804081329675691,
      "kl": 0.23291015625,
      "learning_rate": 6.536668875455869e-08,
      "loss": -0.1854,
      "num_tokens": 35675420.0,
      "reward": 0.05114155262708664,
      "reward_std": 0.047681014984846115,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0012683928944170475,
      "rewards/logprob_reward/std": 0.0017212912207469344,
      "step": 1251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 689.96875,
      "completions/mean_terminated_length": 679.1935424804688,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 3.8641975308641974,
      "grad_norm": 2.0254402888670957,
      "kl": 0.225830078125,
      "learning_rate": 6.502999332383465e-08,
      "loss": -0.0254,
      "num_tokens": 35703959.0,
      "reward": 0.06178828328847885,
      "reward_std": 0.0480516254901886,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002681422047317028,
      "rewards/logprob_reward/std": 0.0036414964124560356,
      "step": 1252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 700.59375,
      "completions/mean_terminated_length": 690.1612548828125,
      "completions/min_length": 531.0,
      "completions/min_terminated_length": 531.0,
      "epoch": 3.867283950617284,
      "grad_norm": 2.772763898119912,
      "kl": 0.236328125,
      "learning_rate": 6.469403758105894e-08,
      "loss": -0.3749,
      "num_tokens": 35733110.0,
      "reward": 0.04874895513057709,
      "reward_std": 0.04067114740610123,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002082169521600008,
      "rewards/logprob_reward/std": 0.0030771277379244566,
      "step": 1253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 696.34375,
      "completions/mean_terminated_length": 685.774169921875,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.8703703703703702,
      "grad_norm": 2.4844023536789477,
      "kl": 0.23388671875,
      "learning_rate": 6.435882286970556e-08,
      "loss": -0.2869,
      "num_tokens": 35762161.0,
      "reward": 0.029596639797091484,
      "reward_std": 0.0414566695690155,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0016351554077118635,
      "rewards/logprob_reward/std": 0.0029583487194031477,
      "step": 1254
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 708.625,
      "completions/mean_terminated_length": 698.4515991210938,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 3.873456790123457,
      "grad_norm": 2.1787411189096044,
      "kl": NaN,
      "learning_rate": 6.402435053028538e-08,
      "loss": -0.1782,
      "num_tokens": 35791397.0,
      "reward": 0.04876616224646568,
      "reward_std": 0.049501799046993256,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0021012898068875074,
      "rewards/logprob_reward/std": 0.003905676770955324,
      "step": 1255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 762.4375,
      "completions/mean_terminated_length": 735.3793334960938,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 3.876543209876543,
      "grad_norm": 2.03078209328597,
      "kl": 0.2298583984375,
      "learning_rate": 6.369062190034036e-08,
      "loss": -0.1787,
      "num_tokens": 35822411.0,
      "reward": 0.04866570979356766,
      "reward_std": 0.047181855887174606,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0019896789453923702,
      "rewards/logprob_reward/std": 0.0026977788656949997,
      "step": 1256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 735.96875,
      "completions/mean_terminated_length": 694.8214721679688,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.8796296296296298,
      "grad_norm": 2.2449930248623526,
      "kl": 0.2501220703125,
      "learning_rate": 6.335763831443847e-08,
      "loss": -0.1564,
      "num_tokens": 35852934.0,
      "reward": 0.03295619413256645,
      "reward_std": 0.04246683791279793,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0018957715947180986,
      "rewards/logprob_reward/std": 0.004091357346624136,
      "step": 1257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 751.1875,
      "completions/mean_terminated_length": 700.6666870117188,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 3.882716049382716,
      "grad_norm": 2.0406042596363556,
      "kl": 0.2218017578125,
      "learning_rate": 6.302540110416837e-08,
      "loss": -0.182,
      "num_tokens": 35883040.0,
      "reward": 0.0543404221534729,
      "reward_std": 0.04171931743621826,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001350468141026795,
      "rewards/logprob_reward/std": 0.0019470700062811375,
      "step": 1258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 705.46875,
      "completions/mean_terminated_length": 684.2333984375,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 3.8858024691358026,
      "grad_norm": 2.393062096676376,
      "kl": 0.2298583984375,
      "learning_rate": 6.269391159813372e-08,
      "loss": -0.171,
      "num_tokens": 35912335.0,
      "reward": 0.051343321800231934,
      "reward_std": 0.04753294587135315,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0014925784198567271,
      "rewards/logprob_reward/std": 0.002065872075036168,
      "step": 1259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 686.4375,
      "completions/mean_terminated_length": 663.933349609375,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 3.888888888888889,
      "grad_norm": 2.876764353542928,
      "kl": 0.2431640625,
      "learning_rate": 6.236317112194844e-08,
      "loss": -0.2222,
      "num_tokens": 35940529.0,
      "reward": 0.04829934984445572,
      "reward_std": 0.04704994708299637,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0015826087910681963,
      "rewards/logprob_reward/std": 0.0027521736919879913,
      "step": 1260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 719.40625,
      "completions/mean_terminated_length": 687.8965454101562,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 3.8919753086419755,
      "grad_norm": 2.183774586870629,
      "kl": 0.23486328125,
      "learning_rate": 6.203318099823094e-08,
      "loss": -0.1403,
      "num_tokens": 35970066.0,
      "reward": 0.0399702787399292,
      "reward_std": 0.04147706180810928,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0027447554748505354,
      "rewards/logprob_reward/std": 0.004764461424201727,
      "step": 1261
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 752.625,
      "completions/mean_terminated_length": 724.5516967773438,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 3.8950617283950617,
      "grad_norm": 1.8409742201490922,
      "kl": NaN,
      "learning_rate": 6.17039425465991e-08,
      "loss": -0.098,
      "num_tokens": 36001174.0,
      "reward": 0.03989966958761215,
      "reward_std": 0.027711469680070877,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0026663001626729965,
      "rewards/logprob_reward/std": 0.0033170164097100496,
      "step": 1262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 723.84375,
      "completions/mean_terminated_length": 714.1612548828125,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 3.898148148148148,
      "grad_norm": 2.294005354144012,
      "kl": 0.2147216796875,
      "learning_rate": 6.137545708366476e-08,
      "loss": -0.3552,
      "num_tokens": 36030573.0,
      "reward": 0.05225500836968422,
      "reward_std": 0.04585783928632736,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002505563199520111,
      "rewards/logprob_reward/std": 0.002747626742348075,
      "step": 1263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 731.25,
      "completions/mean_terminated_length": 700.9655151367188,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 3.9012345679012346,
      "grad_norm": 2.3506752127484782,
      "kl": 0.2198486328125,
      "learning_rate": 6.104772592302868e-08,
      "loss": -0.1969,
      "num_tokens": 36059917.0,
      "reward": 0.051935285329818726,
      "reward_std": 0.04809124022722244,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0021503143943846226,
      "rewards/logprob_reward/std": 0.002932285889983177,
      "step": 1264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 697.0625,
      "completions/mean_terminated_length": 675.2667236328125,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 3.9043209876543212,
      "grad_norm": 2.818576525015284,
      "kl": 0.2191162109375,
      "learning_rate": 6.072075037527519e-08,
      "loss": -0.2663,
      "num_tokens": 36088667.0,
      "reward": 0.05194810777902603,
      "reward_std": 0.055018581449985504,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0021645622327923775,
      "rewards/logprob_reward/std": 0.003375524654984474,
      "step": 1265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 736.9375,
      "completions/mean_terminated_length": 717.800048828125,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.9074074074074074,
      "grad_norm": 1.935271414649702,
      "kl": 0.2105712890625,
      "learning_rate": 6.039453174796699e-08,
      "loss": -0.0688,
      "num_tokens": 36118697.0,
      "reward": 0.05915181338787079,
      "reward_std": 0.055742938071489334,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0032242366578429937,
      "rewards/logprob_reward/std": 0.006154841743409634,
      "step": 1266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 736.34375,
      "completions/mean_terminated_length": 717.1666870117188,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 3.9104938271604937,
      "grad_norm": 2.5734633842286767,
      "kl": 0.2435302734375,
      "learning_rate": 6.006907134563973e-08,
      "loss": -0.3754,
      "num_tokens": 36148816.0,
      "reward": 0.03280571475625038,
      "reward_std": 0.04166591539978981,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0017285719513893127,
      "rewards/logprob_reward/std": 0.0027676033787429333,
      "step": 1267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 702.84375,
      "completions/mean_terminated_length": 681.433349609375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.9135802469135803,
      "grad_norm": 2.0594885812245747,
      "kl": 0.23095703125,
      "learning_rate": 5.974437046979711e-08,
      "loss": -0.1846,
      "num_tokens": 36177675.0,
      "reward": 0.054407186806201935,
      "reward_std": 0.039308175444602966,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001424653921276331,
      "rewards/logprob_reward/std": 0.001887391204945743,
      "step": 1268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 968.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 715.125,
      "completions/mean_terminated_length": 715.125,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 3.9166666666666665,
      "grad_norm": 2.8789717930251757,
      "kl": 0.21484375,
      "learning_rate": 5.9420430418905435e-08,
      "loss": -0.3883,
      "num_tokens": 36206771.0,
      "reward": 0.03529440611600876,
      "reward_std": 0.046531401574611664,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0010215596994385123,
      "rewards/logprob_reward/std": 0.0017232416430488229,
      "step": 1269
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 750.03125,
      "completions/mean_terminated_length": 721.6896362304688,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.919753086419753,
      "grad_norm": 1.6858459763404439,
      "kl": NaN,
      "learning_rate": 5.909725248838854e-08,
      "loss": -0.2047,
      "num_tokens": 36237300.0,
      "reward": 0.03822425380349159,
      "reward_std": 0.0403776541352272,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0008047227747738361,
      "rewards/logprob_reward/std": 0.0015066579217091203,
      "step": 1270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 735.65625,
      "completions/mean_terminated_length": 694.4642944335938,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 3.9228395061728394,
      "grad_norm": 1.9435699110560907,
      "kl": 0.2420654296875,
      "learning_rate": 5.877483797062255e-08,
      "loss": -0.0681,
      "num_tokens": 36267285.0,
      "reward": 0.03239945322275162,
      "reward_std": 0.03997500240802765,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0012771696783602238,
      "rewards/logprob_reward/std": 0.0024854436051100492,
      "step": 1271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 768.9375,
      "completions/mean_terminated_length": 742.5516967773438,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 3.925925925925926,
      "grad_norm": 1.6475202535148172,
      "kl": 0.196044921875,
      "learning_rate": 5.845318815493069e-08,
      "loss": -0.1046,
      "num_tokens": 36298643.0,
      "reward": 0.042463794350624084,
      "reward_std": 0.03438322991132736,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0020431033335626125,
      "rewards/logprob_reward/std": 0.0030193571001291275,
      "step": 1272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 727.84375,
      "completions/mean_terminated_length": 708.1000366210938,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 3.9290123456790123,
      "grad_norm": 1.9309150674595645,
      "kl": 0.2166748046875,
      "learning_rate": 5.813230432757829e-08,
      "loss": -0.1629,
      "num_tokens": 36328302.0,
      "reward": 0.05194342881441116,
      "reward_std": 0.040441401302814484,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002159362193197012,
      "rewards/logprob_reward/std": 0.0026275431737303734,
      "step": 1273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 981.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 735.5,
      "completions/mean_terminated_length": 735.5,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 3.932098765432099,
      "grad_norm": 2.143376594191243,
      "kl": 0.2288818359375,
      "learning_rate": 5.781218777176744e-08,
      "loss": -0.0411,
      "num_tokens": 36358698.0,
      "reward": 0.04697496443986893,
      "reward_std": 0.028916679322719574,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003583295736461878,
      "rewards/logprob_reward/std": 0.005220974329859018,
      "step": 1274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 760.5,
      "completions/mean_terminated_length": 722.857177734375,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.935185185185185,
      "grad_norm": 1.7644029435866255,
      "kl": 0.2239990234375,
      "learning_rate": 5.749283976763186e-08,
      "loss": -0.0994,
      "num_tokens": 36389534.0,
      "reward": 0.042009782046079636,
      "reward_std": 0.045120202004909515,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015386473387479782,
      "rewards/logprob_reward/std": 0.0025592546444386244,
      "step": 1275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 768.53125,
      "completions/mean_terminated_length": 732.0357666015625,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.9382716049382713,
      "grad_norm": 2.140303836367389,
      "kl": 0.2249755859375,
      "learning_rate": 5.717426159223204e-08,
      "loss": -0.1209,
      "num_tokens": 36420583.0,
      "reward": 0.026612577959895134,
      "reward_std": 0.04570880904793739,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0017917529912665486,
      "rewards/logprob_reward/std": 0.004275859799236059,
      "step": 1276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 795.875,
      "completions/mean_terminated_length": 753.629638671875,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 3.941358024691358,
      "grad_norm": 1.719896804050668,
      "kl": 0.22021484375,
      "learning_rate": 5.685645451954976e-08,
      "loss": -0.062,
      "num_tokens": 36453515.0,
      "reward": 0.041013821959495544,
      "reward_std": 0.04107628017663956,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.003904248122125864,
      "rewards/logprob_reward/std": 0.006139431614428759,
      "step": 1277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 828.4375,
      "completions/mean_terminated_length": 783.3077392578125,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.9444444444444446,
      "grad_norm": 2.233144436167193,
      "kl": 0.19189453125,
      "learning_rate": 5.653941982048333e-08,
      "loss": -0.1086,
      "num_tokens": 36486521.0,
      "reward": 0.048560962080955505,
      "reward_std": 0.04907785356044769,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001873290748335421,
      "rewards/logprob_reward/std": 0.003017270006239414,
      "step": 1278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 736.34375,
      "completions/mean_terminated_length": 706.586181640625,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 3.947530864197531,
      "grad_norm": 2.7394252149497937,
      "kl": 0.198486328125,
      "learning_rate": 5.6223158762842336e-08,
      "loss": -0.3063,
      "num_tokens": 36516664.0,
      "reward": 0.04506908357143402,
      "reward_std": 0.05461621284484863,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0014656463172286749,
      "rewards/logprob_reward/std": 0.0023357095196843147,
      "step": 1279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 741.84375,
      "completions/mean_terminated_length": 712.6551513671875,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 3.950617283950617,
      "grad_norm": 2.128755839405949,
      "kl": 0.2039794921875,
      "learning_rate": 5.59076726113426e-08,
      "loss": -0.1533,
      "num_tokens": 36546963.0,
      "reward": 0.04205043986439705,
      "reward_std": 0.054709989577531815,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015838217223063111,
      "rewards/logprob_reward/std": 0.0026091295294463634,
      "step": 1280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 700.1875,
      "completions/mean_terminated_length": 653.9285888671875,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 3.9537037037037037,
      "grad_norm": 2.0218528154180695,
      "kl": 0.248779296875,
      "learning_rate": 5.55929626276011e-08,
      "loss": -0.1079,
      "num_tokens": 36575569.0,
      "reward": 0.045324429869651794,
      "reward_std": 0.041458792984485626,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017493651248514652,
      "rewards/logprob_reward/std": 0.0029430529102683067,
      "step": 1281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 803.5,
      "completions/mean_terminated_length": 772.0000610351562,
      "completions/min_length": 563.0,
      "completions/min_terminated_length": 563.0,
      "epoch": 3.9567901234567904,
      "grad_norm": 1.8581704646255597,
      "kl": 0.1986083984375,
      "learning_rate": 5.527903007013099e-08,
      "loss": 0.0187,
      "num_tokens": 36607609.0,
      "reward": 0.03619496896862984,
      "reward_std": 0.041637122631073,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0020221881568431854,
      "rewards/logprob_reward/std": 0.0022943217772990465,
      "step": 1282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 753.125,
      "completions/mean_terminated_length": 725.1034545898438,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 3.9598765432098766,
      "grad_norm": 1.8148890962873936,
      "kl": 0.2056884765625,
      "learning_rate": 5.4965876194336567e-08,
      "loss": -0.0677,
      "num_tokens": 36638517.0,
      "reward": 0.05739375948905945,
      "reward_std": 0.04625089839100838,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0047430675476789474,
      "rewards/logprob_reward/std": 0.006590752396732569,
      "step": 1283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 738.65625,
      "completions/mean_terminated_length": 685.8148193359375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.962962962962963,
      "grad_norm": 2.1778843692935577,
      "kl": 0.2286376953125,
      "learning_rate": 5.465350225250801e-08,
      "loss": -0.1401,
      "num_tokens": 36668778.0,
      "reward": 0.02642800100147724,
      "reward_std": 0.03563851863145828,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0015866662142798305,
      "rewards/logprob_reward/std": 0.002376316348090768,
      "step": 1284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 684.875,
      "completions/mean_terminated_length": 673.9354858398438,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 3.9660493827160495,
      "grad_norm": 2.012474825375182,
      "kl": 0.2205810546875,
      "learning_rate": 5.4341909493816786e-08,
      "loss": -0.3313,
      "num_tokens": 36697326.0,
      "reward": 0.05250188708305359,
      "reward_std": 0.04848799109458923,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002779874252155423,
      "rewards/logprob_reward/std": 0.00423223152756691,
      "step": 1285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 705.71875,
      "completions/mean_terminated_length": 695.4515991210938,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.9691358024691357,
      "grad_norm": 3.177058697222281,
      "kl": 0.2593994140625,
      "learning_rate": 5.4031099164310314e-08,
      "loss": -0.4042,
      "num_tokens": 36725925.0,
      "reward": 0.025932677090168,
      "reward_std": 0.0403948575258255,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0010363080073148012,
      "rewards/logprob_reward/std": 0.0019922624342143536,
      "step": 1286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 733.09375,
      "completions/mean_terminated_length": 691.5357666015625,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 3.9722222222222223,
      "grad_norm": 2.0828686651994848,
      "kl": 0.233642578125,
      "learning_rate": 5.372107250690719e-08,
      "loss": -0.1359,
      "num_tokens": 36755440.0,
      "reward": 0.04142270237207413,
      "reward_std": 0.04101049154996872,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.000886336958501488,
      "rewards/logprob_reward/std": 0.001125569804571569,
      "step": 1287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 809.0,
      "completions/mean_length": 669.03125,
      "completions/mean_terminated_length": 645.36669921875,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 3.9753086419753085,
      "grad_norm": 2.113311396315076,
      "kl": 0.2552490234375,
      "learning_rate": 5.341183076139219e-08,
      "loss": -0.1674,
      "num_tokens": 36783025.0,
      "reward": 0.0463469922542572,
      "reward_std": 0.040201857686042786,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0028855446726083755,
      "rewards/logprob_reward/std": 0.007167118601500988,
      "step": 1288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1006.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 698.25,
      "completions/mean_terminated_length": 698.25,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 3.978395061728395,
      "grad_norm": 3.0365780822405837,
      "kl": 0.224365234375,
      "learning_rate": 5.310337516441102e-08,
      "loss": -0.1641,
      "num_tokens": 36811629.0,
      "reward": 0.06484922766685486,
      "reward_std": 0.04804805666208267,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0026102494448423386,
      "rewards/logprob_reward/std": 0.002949668560177088,
      "step": 1289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 720.125,
      "completions/mean_terminated_length": 699.86669921875,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 3.9814814814814814,
      "grad_norm": 2.058199466935219,
      "kl": 0.22705078125,
      "learning_rate": 5.279570694946581e-08,
      "loss": -0.3072,
      "num_tokens": 36841117.0,
      "reward": 0.05823837220668793,
      "reward_std": 0.041144661605358124,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0022093013394623995,
      "rewards/logprob_reward/std": 0.0028478633612394333,
      "step": 1290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 738.28125,
      "completions/mean_terminated_length": 719.2333984375,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 3.984567901234568,
      "grad_norm": 1.9958376821667123,
      "kl": 0.205078125,
      "learning_rate": 5.2488827346910015e-08,
      "loss": -0.0794,
      "num_tokens": 36871182.0,
      "reward": 0.05141979455947876,
      "reward_std": 0.04218154773116112,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0015775496140122414,
      "rewards/logprob_reward/std": 0.0016699270345270634,
      "step": 1291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 731.53125,
      "completions/mean_terminated_length": 712.0333862304688,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 3.9876543209876543,
      "grad_norm": 2.0667616398771655,
      "kl": 0.2208251953125,
      "learning_rate": 5.21827375839432e-08,
      "loss": -0.0656,
      "num_tokens": 36901519.0,
      "reward": 0.0433460995554924,
      "reward_std": 0.04242153465747833,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0030234456062316895,
      "rewards/logprob_reward/std": 0.004557882435619831,
      "step": 1292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 747.4375,
      "completions/mean_terminated_length": 707.9285888671875,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 3.9907407407407405,
      "grad_norm": 2.3584565129407253,
      "kl": 0.2208251953125,
      "learning_rate": 5.187743888460669e-08,
      "loss": -0.2497,
      "num_tokens": 36932101.0,
      "reward": 0.04608844593167305,
      "reward_std": 0.04741070419549942,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0025982714723795652,
      "rewards/logprob_reward/std": 0.0033916078973561525,
      "step": 1293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 729.53125,
      "completions/mean_terminated_length": 709.9000244140625,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 3.993827160493827,
      "grad_norm": 1.8885638626101118,
      "kl": 0.205810546875,
      "learning_rate": 5.15729324697782e-08,
      "loss": -0.1506,
      "num_tokens": 36961610.0,
      "reward": 0.05801812931895256,
      "reward_std": 0.04801099747419357,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001964585855603218,
      "rewards/logprob_reward/std": 0.003837130730971694,
      "step": 1294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 682.71875,
      "completions/mean_terminated_length": 671.7096557617188,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 3.996913580246914,
      "grad_norm": 2.107292429850407,
      "kl": 0.244873046875,
      "learning_rate": 5.126921955716723e-08,
      "loss": -0.1056,
      "num_tokens": 36989857.0,
      "reward": 0.04603494703769684,
      "reward_std": 0.04803794249892235,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002538828644901514,
      "rewards/logprob_reward/std": 0.003044616896659136,
      "step": 1295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 743.1875,
      "completions/mean_terminated_length": 734.1290283203125,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 4.0,
      "grad_norm": 2.142740634714608,
      "kl": 0.2208251953125,
      "learning_rate": 5.096630136131e-08,
      "loss": -0.1877,
      "num_tokens": 37019783.0,
      "reward": 0.03996527940034866,
      "reward_std": 0.04848010092973709,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0027391978073865175,
      "rewards/logprob_reward/std": 0.0035382481291890144,
      "step": 1296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 747.34375,
      "completions/mean_terminated_length": 728.9000244140625,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 4.003086419753086,
      "grad_norm": 2.8270253368033162,
      "kl": 0.214599609375,
      "learning_rate": 5.0664179093564765e-08,
      "loss": -0.3346,
      "num_tokens": 37049810.0,
      "reward": 0.04774084687232971,
      "reward_std": 0.05377994477748871,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0009620493510738015,
      "rewards/logprob_reward/std": 0.0015795612707734108,
      "step": 1297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 755.09375,
      "completions/mean_terminated_length": 727.27587890625,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.006172839506172,
      "grad_norm": 2.6042754047768164,
      "kl": 0.2205810546875,
      "learning_rate": 5.036285396210685e-08,
      "loss": -0.2129,
      "num_tokens": 37080717.0,
      "reward": 0.02016562968492508,
      "reward_std": 0.033509328961372375,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0015729216393083334,
      "rewards/logprob_reward/std": 0.002818315988406539,
      "step": 1298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 734.0625,
      "completions/mean_terminated_length": 704.0689697265625,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 4.0092592592592595,
      "grad_norm": 2.207656836279446,
      "kl": 0.2156982421875,
      "learning_rate": 5.0062327171923935e-08,
      "loss": -0.1427,
      "num_tokens": 37110499.0,
      "reward": 0.05492895096540451,
      "reward_std": 0.05219528079032898,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.00200439291074872,
      "rewards/logprob_reward/std": 0.002018099185079336,
      "step": 1299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 706.15625,
      "completions/mean_terminated_length": 647.2963256835938,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 4.012345679012346,
      "grad_norm": 1.8314022728111405,
      "kl": 0.2392578125,
      "learning_rate": 4.976259992481097e-08,
      "loss": -0.1421,
      "num_tokens": 37139552.0,
      "reward": 0.05195342376828194,
      "reward_std": 0.03563021868467331,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0021704700775444508,
      "rewards/logprob_reward/std": 0.004170523025095463,
      "step": 1300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 698.75,
      "completions/mean_terminated_length": 677.0667114257812,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 4.015432098765432,
      "grad_norm": 1.7378077558782936,
      "kl": 0.219970703125,
      "learning_rate": 4.946367341936578e-08,
      "loss": -0.1484,
      "num_tokens": 37168652.0,
      "reward": 0.0628993883728981,
      "reward_std": 0.04155369848012924,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.003915988374501467,
      "rewards/logprob_reward/std": 0.003737315535545349,
      "step": 1301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 747.34375,
      "completions/mean_terminated_length": 738.4193115234375,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 4.018518518518518,
      "grad_norm": 2.029197340086473,
      "kl": 0.2149658203125,
      "learning_rate": 4.916554885098403e-08,
      "loss": -0.1367,
      "num_tokens": 37198923.0,
      "reward": 0.0451449491083622,
      "reward_std": 0.04612676426768303,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001549942884594202,
      "rewards/logprob_reward/std": 0.0025856001302599907,
      "step": 1302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 686.15625,
      "completions/mean_terminated_length": 651.2069091796875,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 4.021604938271605,
      "grad_norm": 3.3882430774395242,
      "kl": 0.249267578125,
      "learning_rate": 4.8868227411854287e-08,
      "loss": -0.2959,
      "num_tokens": 37227284.0,
      "reward": 0.04487443342804909,
      "reward_std": 0.04793141037225723,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001249370165169239,
      "rewards/logprob_reward/std": 0.0020175762474536896,
      "step": 1303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 703.78125,
      "completions/mean_terminated_length": 670.6551513671875,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 4.0246913580246915,
      "grad_norm": 1.9171852354819447,
      "kl": 0.2298583984375,
      "learning_rate": 4.857171029095364e-08,
      "loss": -0.0861,
      "num_tokens": 37256053.0,
      "reward": 0.048420801758766174,
      "reward_std": 0.03434731066226959,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0017175577813759446,
      "rewards/logprob_reward/std": 0.002520865062251687,
      "step": 1304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 749.71875,
      "completions/mean_terminated_length": 721.3448486328125,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.027777777777778,
      "grad_norm": 2.2563786099900054,
      "kl": 0.24560546875,
      "learning_rate": 4.827599867404261e-08,
      "loss": -0.266,
      "num_tokens": 37286588.0,
      "reward": 0.04632047563791275,
      "reward_std": 0.05095044896006584,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0028560864739120007,
      "rewards/logprob_reward/std": 0.0039827702566981316,
      "step": 1305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 707.28125,
      "completions/mean_terminated_length": 674.5172119140625,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 4.030864197530864,
      "grad_norm": 1.8861105968798706,
      "kl": 0.220458984375,
      "learning_rate": 4.7981093743660634e-08,
      "loss": -0.0226,
      "num_tokens": 37315921.0,
      "reward": 0.052245382219552994,
      "reward_std": 0.03538450226187706,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002494871150702238,
      "rewards/logprob_reward/std": 0.002427044091746211,
      "step": 1306
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1000.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 736.28125,
      "completions/mean_terminated_length": 736.28125,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 4.033950617283951,
      "grad_norm": 2.401898927796471,
      "kl": NaN,
      "learning_rate": 4.768699667912118e-08,
      "loss": -0.3143,
      "num_tokens": 37346710.0,
      "reward": 0.03275703638792038,
      "reward_std": 0.04243364930152893,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0016744830645620823,
      "rewards/logprob_reward/std": 0.0034032058902084827,
      "step": 1307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 684.3125,
      "completions/mean_terminated_length": 661.6666870117188,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 4.037037037037037,
      "grad_norm": 1.7161828447911658,
      "kl": 0.2215576171875,
      "learning_rate": 4.739370865650716e-08,
      "loss": -0.1491,
      "num_tokens": 37374792.0,
      "reward": 0.05248447135090828,
      "reward_std": 0.04930894821882248,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002760524395853281,
      "rewards/logprob_reward/std": 0.007413600105792284,
      "step": 1308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 756.78125,
      "completions/mean_terminated_length": 707.2963256835938,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 4.040123456790123,
      "grad_norm": 2.3968823354359907,
      "kl": 0.2513427734375,
      "learning_rate": 4.710123084866602e-08,
      "loss": -0.144,
      "num_tokens": 37406137.0,
      "reward": 0.026766877621412277,
      "reward_std": 0.03382844850420952,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.001963196322321892,
      "rewards/logprob_reward/std": 0.0037106643430888653,
      "step": 1309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 888.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 667.375,
      "completions/mean_terminated_length": 667.375,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.04320987654321,
      "grad_norm": 1.6743847336337871,
      "kl": 0.2391357421875,
      "learning_rate": 4.6809564425205286e-08,
      "loss": -0.0379,
      "num_tokens": 37434045.0,
      "reward": 0.04869449511170387,
      "reward_std": 0.028283096849918365,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0020216605626046658,
      "rewards/logprob_reward/std": 0.0020715193822979927,
      "step": 1310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 741.96875,
      "completions/mean_terminated_length": 723.1666870117188,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 4.046296296296297,
      "grad_norm": 2.6716410427355513,
      "kl": 0.2393798828125,
      "learning_rate": 4.6518710552487796e-08,
      "loss": -0.271,
      "num_tokens": 37464308.0,
      "reward": 0.03659878298640251,
      "reward_std": 0.03978034108877182,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0024708695709705353,
      "rewards/logprob_reward/std": 0.003682315582409501,
      "step": 1311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 680.4375,
      "completions/mean_terminated_length": 644.8965454101562,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 4.049382716049383,
      "grad_norm": 2.2261017763514888,
      "kl": 0.2520751953125,
      "learning_rate": 4.6228670393627014e-08,
      "loss": -0.2621,
      "num_tokens": 37492222.0,
      "reward": 0.054087117314338684,
      "reward_std": 0.048690445721149445,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001069014542736113,
      "rewards/logprob_reward/std": 0.0017158350674435496,
      "step": 1312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 777.84375,
      "completions/mean_terminated_length": 721.0385131835938,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.052469135802469,
      "grad_norm": 1.990666661613414,
      "kl": 0.2265625,
      "learning_rate": 4.5939445108482466e-08,
      "loss": -0.0802,
      "num_tokens": 37523349.0,
      "reward": 0.020132973790168762,
      "reward_std": 0.02895219996571541,
      "rewards/format_reward_func/mean": 0.1875,
      "rewards/format_reward_func/std": 0.3965577781200409,
      "rewards/logprob_reward/mean": 0.0015366380102932453,
      "rewards/logprob_reward/std": 0.002819651272147894,
      "step": 1313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 737.90625,
      "completions/mean_terminated_length": 708.3103637695312,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 4.055555555555555,
      "grad_norm": 2.089280188424189,
      "kl": 0.21875,
      "learning_rate": 4.565103585365479e-08,
      "loss": -0.1981,
      "num_tokens": 37553238.0,
      "reward": 0.057640429586172104,
      "reward_std": 0.04725799337029457,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0015449193306267262,
      "rewards/logprob_reward/std": 0.0019049550173804164,
      "step": 1314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 997.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 666.4375,
      "completions/mean_terminated_length": 666.4375,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.058641975308642,
      "grad_norm": 1.8829293078272262,
      "kl": 0.2283935546875,
      "learning_rate": 4.536344378248161e-08,
      "loss": -0.2137,
      "num_tokens": 37580592.0,
      "reward": 0.07006511092185974,
      "reward_std": 0.045542240142822266,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0014612269587814808,
      "rewards/logprob_reward/std": 0.001866519683972001,
      "step": 1315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 719.5625,
      "completions/mean_terminated_length": 699.2667236328125,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 4.061728395061729,
      "grad_norm": 2.1688479938586833,
      "kl": 0.2685546875,
      "learning_rate": 4.50766700450326e-08,
      "loss": -0.0683,
      "num_tokens": 37610270.0,
      "reward": 0.0513153001666069,
      "reward_std": 0.04761355742812157,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0014614476822316647,
      "rewards/logprob_reward/std": 0.0018111151875928044,
      "step": 1316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 884.0,
      "completions/max_terminated_length": 884.0,
      "completions/mean_length": 649.3125,
      "completions/mean_terminated_length": 649.3125,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 4.064814814814815,
      "grad_norm": 2.2932016624952336,
      "kl": 0.2490234375,
      "learning_rate": 4.479071578810481e-08,
      "loss": -0.1529,
      "num_tokens": 37637104.0,
      "reward": 0.03536916524171829,
      "reward_std": 0.04151330143213272,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001104630995541811,
      "rewards/logprob_reward/std": 0.0021797718945890665,
      "step": 1317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1000.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 663.84375,
      "completions/mean_terminated_length": 663.84375,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 4.067901234567901,
      "grad_norm": 2.001613550630462,
      "kl": 0.236083984375,
      "learning_rate": 4.450558215521838e-08,
      "loss": -0.0023,
      "num_tokens": 37664911.0,
      "reward": 0.051741935312747955,
      "reward_std": 0.05239560455083847,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0019354848191142082,
      "rewards/logprob_reward/std": 0.0026733444537967443,
      "step": 1318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 994.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 684.90625,
      "completions/mean_terminated_length": 684.90625,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 4.070987654320987,
      "grad_norm": 2.1952092388258446,
      "kl": 0.225341796875,
      "learning_rate": 4.4221270286611765e-08,
      "loss": -0.2355,
      "num_tokens": 37693312.0,
      "reward": 0.036763377487659454,
      "reward_std": 0.04703374579548836,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0026537529192864895,
      "rewards/logprob_reward/std": 0.0032631775829941034,
      "step": 1319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 739.28125,
      "completions/mean_terminated_length": 720.300048828125,
      "completions/min_length": 551.0,
      "completions/min_terminated_length": 551.0,
      "epoch": 4.074074074074074,
      "grad_norm": 1.8023025242789739,
      "kl": 0.221923828125,
      "learning_rate": 4.3937781319237175e-08,
      "loss": -0.1363,
      "num_tokens": 37723541.0,
      "reward": 0.038583770394325256,
      "reward_std": 0.03296907618641853,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012041892623528838,
      "rewards/logprob_reward/std": 0.0015725565608590841,
      "step": 1320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 825.6875,
      "completions/mean_terminated_length": 748.0869750976562,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 4.077160493827161,
      "grad_norm": 2.5226843610208944,
      "kl": 0.2186279296875,
      "learning_rate": 4.365511638675612e-08,
      "loss": -0.3079,
      "num_tokens": 37756983.0,
      "reward": 0.032925527542829514,
      "reward_std": 0.02233968675136566,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.008806141093373299,
      "rewards/logprob_reward/std": 0.017590748146176338,
      "step": 1321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 704.09375,
      "completions/mean_terminated_length": 693.774169921875,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 4.080246913580247,
      "grad_norm": 1.5330479855867665,
      "kl": 0.2310791015625,
      "learning_rate": 4.337327661953477e-08,
      "loss": -0.07,
      "num_tokens": 37785814.0,
      "reward": 0.054545577615499496,
      "reward_std": 0.041127897799015045,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001578418887220323,
      "rewards/logprob_reward/std": 0.0018107314826920629,
      "step": 1322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 730.1875,
      "completions/mean_terminated_length": 699.7930908203125,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 4.083333333333333,
      "grad_norm": 3.4392079834484965,
      "kl": 0.2095947265625,
      "learning_rate": 4.3092263144639565e-08,
      "loss": -0.3143,
      "num_tokens": 37815484.0,
      "reward": 0.05121327191591263,
      "reward_std": 0.05133683979511261,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0013480777852237225,
      "rewards/logprob_reward/std": 0.0025840946473181248,
      "step": 1323
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 743.90625,
      "completions/mean_terminated_length": 725.2333984375,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 4.08641975308642,
      "grad_norm": 1.9214781567583756,
      "kl": NaN,
      "learning_rate": 4.281207708583256e-08,
      "loss": -0.0852,
      "num_tokens": 37846061.0,
      "reward": 0.04986279085278511,
      "reward_std": 0.04641471430659294,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003319765906780958,
      "rewards/logprob_reward/std": 0.004062604624778032,
      "step": 1324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 743.5625,
      "completions/mean_terminated_length": 714.5516967773438,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 4.089506172839506,
      "grad_norm": 2.366127547715954,
      "kl": 0.218994140625,
      "learning_rate": 4.253271956356713e-08,
      "loss": -0.1787,
      "num_tokens": 37876043.0,
      "reward": 0.051838453859090805,
      "reward_std": 0.04091264307498932,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0020427240524441004,
      "rewards/logprob_reward/std": 0.002830656711012125,
      "step": 1325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 763.28125,
      "completions/mean_terminated_length": 736.3103637695312,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 512.0,
      "epoch": 4.092592592592593,
      "grad_norm": 2.2531084527035894,
      "kl": 0.2030029296875,
      "learning_rate": 4.2254191694983096e-08,
      "loss": -0.0391,
      "num_tokens": 37907116.0,
      "reward": 0.0456613227725029,
      "reward_std": 0.040709540247917175,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0021236930042505264,
      "rewards/logprob_reward/std": 0.002349415561184287,
      "step": 1326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 738.78125,
      "completions/mean_terminated_length": 709.27587890625,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 4.095679012345679,
      "grad_norm": 3.8007254694939427,
      "kl": 0.22900390625,
      "learning_rate": 4.197649459390287e-08,
      "loss": -0.3119,
      "num_tokens": 37937241.0,
      "reward": 0.02436847798526287,
      "reward_std": 0.04079977050423622,
      "rewards/format_reward_func/mean": 0.21875,
      "rewards/format_reward_func/std": 0.420013427734375,
      "rewards/logprob_reward/mean": 0.0027705305255949497,
      "rewards/logprob_reward/std": 0.004038138780742884,
      "step": 1327
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 753.0625,
      "completions/mean_terminated_length": 735.0000610351562,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 4.098765432098766,
      "grad_norm": 2.401523271494944,
      "kl": NaN,
      "learning_rate": 4.169962937082635e-08,
      "loss": -0.1261,
      "num_tokens": 37968615.0,
      "reward": 0.031107213348150253,
      "reward_std": 0.027722710743546486,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0033135702833533287,
      "rewards/logprob_reward/std": 0.004784339107573032,
      "step": 1328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 677.3125,
      "completions/mean_terminated_length": 654.2000122070312,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 4.101851851851852,
      "grad_norm": 1.9800827156216652,
      "kl": 0.2451171875,
      "learning_rate": 4.142359713292698e-08,
      "loss": -0.032,
      "num_tokens": 37996817.0,
      "reward": 0.05676977336406708,
      "reward_std": 0.03475790470838547,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.004049746319651604,
      "rewards/logprob_reward/std": 0.003584577701985836,
      "step": 1329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 703.9375,
      "completions/mean_terminated_length": 682.6000366210938,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 4.104938271604938,
      "grad_norm": 2.2064144612947807,
      "kl": 0.241455078125,
      "learning_rate": 4.11483989840471e-08,
      "loss": -0.2097,
      "num_tokens": 38025819.0,
      "reward": 0.04861665517091751,
      "reward_std": 0.04186452925205231,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001935172826051712,
      "rewards/logprob_reward/std": 0.0028661531396210194,
      "step": 1330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 766.90625,
      "completions/mean_terminated_length": 740.3103637695312,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.1080246913580245,
      "grad_norm": 2.255648257423402,
      "kl": 0.208251953125,
      "learning_rate": 4.087403602469347e-08,
      "loss": -0.132,
      "num_tokens": 38057124.0,
      "reward": 0.05841232091188431,
      "reward_std": 0.054690130054950714,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0024025829043239355,
      "rewards/logprob_reward/std": 0.003108304226770997,
      "step": 1331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 711.40625,
      "completions/mean_terminated_length": 701.3225708007812,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 4.111111111111111,
      "grad_norm": 2.5811801352317443,
      "kl": 0.2423095703125,
      "learning_rate": 4.060050935203307e-08,
      "loss": -0.231,
      "num_tokens": 38086841.0,
      "reward": 0.04872202128171921,
      "reward_std": 0.044247761368751526,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.005524467211216688,
      "rewards/logprob_reward/std": 0.01583181507885456,
      "step": 1332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 738.59375,
      "completions/mean_terminated_length": 697.8214721679688,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 4.114197530864198,
      "grad_norm": 2.185577263120209,
      "kl": 0.2388916015625,
      "learning_rate": 4.032782005988861e-08,
      "loss": -0.011,
      "num_tokens": 38117336.0,
      "reward": 0.041978344321250916,
      "reward_std": 0.03934880346059799,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015037164557725191,
      "rewards/logprob_reward/std": 0.0026061886455863714,
      "step": 1333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 758.65625,
      "completions/mean_terminated_length": 740.9666748046875,
      "completions/min_length": 543.0,
      "completions/min_terminated_length": 543.0,
      "epoch": 4.117283950617284,
      "grad_norm": 1.8785321943038986,
      "kl": 0.218505859375,
      "learning_rate": 4.0055969238733945e-08,
      "loss": -0.0958,
      "num_tokens": 38147733.0,
      "reward": 0.03570020571351051,
      "reward_std": 0.04679879546165466,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0014724504435434937,
      "rewards/logprob_reward/std": 0.0028168363496661186,
      "step": 1334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 751.84375,
      "completions/mean_terminated_length": 701.4444580078125,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 4.12037037037037,
      "grad_norm": 1.931916647953982,
      "kl": 0.2218017578125,
      "learning_rate": 3.978495797569012e-08,
      "loss": -0.0929,
      "num_tokens": 38178636.0,
      "reward": 0.043558232486248016,
      "reward_std": 0.042037613689899445,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0032591475173830986,
      "rewards/logprob_reward/std": 0.0042504193261265755,
      "step": 1335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 687.375,
      "completions/mean_terminated_length": 652.5516967773438,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 4.1234567901234565,
      "grad_norm": 2.478648178978233,
      "kl": 0.236083984375,
      "learning_rate": 3.95147873545208e-08,
      "loss": -0.2018,
      "num_tokens": 38207168.0,
      "reward": 0.051688190549612045,
      "reward_std": 0.04069924354553223,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0018757663201540709,
      "rewards/logprob_reward/std": 0.0025266511365771294,
      "step": 1336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 757.5,
      "completions/mean_terminated_length": 739.7333984375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.1265432098765435,
      "grad_norm": 2.024372736436707,
      "kl": 0.22265625,
      "learning_rate": 3.924545845562791e-08,
      "loss": -0.2783,
      "num_tokens": 38237936.0,
      "reward": 0.027438897639513016,
      "reward_std": 0.03564389795064926,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.00270988536067307,
      "rewards/logprob_reward/std": 0.0028934788424521685,
      "step": 1337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 730.125,
      "completions/mean_terminated_length": 688.1428833007812,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 4.12962962962963,
      "grad_norm": 2.0183830172881025,
      "kl": 0.2080078125,
      "learning_rate": 3.8976972356047325e-08,
      "loss": -0.0717,
      "num_tokens": 38268200.0,
      "reward": 0.03374570980668068,
      "reward_std": 0.04171259328722954,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0027730122674256563,
      "rewards/logprob_reward/std": 0.006873125210404396,
      "step": 1338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 709.625,
      "completions/mean_terminated_length": 677.1034545898438,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 4.132716049382716,
      "grad_norm": 1.9753625961217376,
      "kl": 0.2222900390625,
      "learning_rate": 3.870933012944472e-08,
      "loss": -0.1405,
      "num_tokens": 38297508.0,
      "reward": 0.049009934067726135,
      "reward_std": 0.04322347044944763,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002372150309383869,
      "rewards/logprob_reward/std": 0.0031108625698834658,
      "step": 1339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 694.15625,
      "completions/mean_terminated_length": 672.1666870117188,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 4.135802469135802,
      "grad_norm": 1.7466544081334954,
      "kl": 0.2421875,
      "learning_rate": 3.844253284611096e-08,
      "loss": -0.1519,
      "num_tokens": 38326157.0,
      "reward": 0.05518614873290062,
      "reward_std": 0.046291884034872055,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0022901634220033884,
      "rewards/logprob_reward/std": 0.0030652545392513275,
      "step": 1340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 656.625,
      "completions/mean_terminated_length": 644.774169921875,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 4.138888888888889,
      "grad_norm": 1.923109938186103,
      "kl": 0.247802734375,
      "learning_rate": 3.817658157295819e-08,
      "loss": -0.1293,
      "num_tokens": 38353365.0,
      "reward": 0.06859253346920013,
      "reward_std": 0.054425276815891266,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0032972614280879498,
      "rewards/logprob_reward/std": 0.0040869941003620625,
      "step": 1341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 730.625,
      "completions/mean_terminated_length": 711.0667114257812,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.1419753086419755,
      "grad_norm": 2.170302985266013,
      "kl": 0.227294921875,
      "learning_rate": 3.791147737351541e-08,
      "loss": -0.1534,
      "num_tokens": 38383141.0,
      "reward": 0.04599742218852043,
      "reward_std": 0.048829492181539536,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0024971363600343466,
      "rewards/logprob_reward/std": 0.0028372197411954403,
      "step": 1342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 752.1875,
      "completions/mean_terminated_length": 689.4615478515625,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 4.145061728395062,
      "grad_norm": 2.2790726792037233,
      "kl": 0.23046875,
      "learning_rate": 3.7647221307923946e-08,
      "loss": -0.2172,
      "num_tokens": 38413699.0,
      "reward": 0.050827592611312866,
      "reward_std": 0.045222967863082886,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0009195467573590577,
      "rewards/logprob_reward/std": 0.001251706387847662,
      "step": 1343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 752.34375,
      "completions/mean_terminated_length": 734.2333984375,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 4.148148148148148,
      "grad_norm": 2.228274365183252,
      "kl": 0.2052001953125,
      "learning_rate": 3.738381443293376e-08,
      "loss": -0.2613,
      "num_tokens": 38444310.0,
      "reward": 0.026713576167821884,
      "reward_std": 0.040478356182575226,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0019039744511246681,
      "rewards/logprob_reward/std": 0.0031483571510761976,
      "step": 1344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 690.65625,
      "completions/mean_terminated_length": 679.9031982421875,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 4.151234567901234,
      "grad_norm": 2.3378643080570183,
      "kl": 0.24658203125,
      "learning_rate": 3.7121257801898814e-08,
      "loss": -0.3647,
      "num_tokens": 38472995.0,
      "reward": 0.04639032483100891,
      "reward_std": 0.04832562804222107,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002933695912361145,
      "rewards/logprob_reward/std": 0.0039755916222929955,
      "step": 1345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 689.5625,
      "completions/mean_terminated_length": 654.9655151367188,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 4.154320987654321,
      "grad_norm": 1.982373303359418,
      "kl": 0.2449951171875,
      "learning_rate": 3.685955246477296e-08,
      "loss": -0.1405,
      "num_tokens": 38501369.0,
      "reward": 0.04185140132904053,
      "reward_std": 0.04646693170070648,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0013626698637381196,
      "rewards/logprob_reward/std": 0.0017280688043683767,
      "step": 1346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 682.75,
      "completions/mean_terminated_length": 671.741943359375,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 4.157407407407407,
      "grad_norm": 1.5521349183526207,
      "kl": 0.2357177734375,
      "learning_rate": 3.659869946810581e-08,
      "loss": -0.1462,
      "num_tokens": 38529137.0,
      "reward": 0.06384395062923431,
      "reward_std": 0.03511165454983711,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0014932783087715507,
      "rewards/logprob_reward/std": 0.0016576785128563643,
      "step": 1347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 758.40625,
      "completions/mean_terminated_length": 730.9310302734375,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 4.160493827160494,
      "grad_norm": 2.1717397888291554,
      "kl": 0.2120361328125,
      "learning_rate": 3.6338699855038486e-08,
      "loss": -0.1148,
      "num_tokens": 38560274.0,
      "reward": 0.045266292989254,
      "reward_std": 0.04792287200689316,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001684766379185021,
      "rewards/logprob_reward/std": 0.00244696531444788,
      "step": 1348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 726.875,
      "completions/mean_terminated_length": 696.137939453125,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 4.16358024691358,
      "grad_norm": 1.9587763234845543,
      "kl": 0.222412109375,
      "learning_rate": 3.6079554665299414e-08,
      "loss": -0.1509,
      "num_tokens": 38590426.0,
      "reward": 0.04997875541448593,
      "reward_std": 0.04206637293100357,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003448616247624159,
      "rewards/logprob_reward/std": 0.00576774962246418,
      "step": 1349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 741.5,
      "completions/mean_terminated_length": 712.27587890625,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 4.166666666666667,
      "grad_norm": 2.354385124226021,
      "kl": 0.2376708984375,
      "learning_rate": 3.5821264935200294e-08,
      "loss": -0.2053,
      "num_tokens": 38620898.0,
      "reward": 0.04564407467842102,
      "reward_std": 0.03507474064826965,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002104530343785882,
      "rewards/logprob_reward/std": 0.003115322906523943,
      "step": 1350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 959.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 637.5625,
      "completions/mean_terminated_length": 637.5625,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 4.169753086419753,
      "grad_norm": 1.6875140352824833,
      "kl": 0.2469482421875,
      "learning_rate": 3.5563831697631776e-08,
      "loss": -0.1708,
      "num_tokens": 38647516.0,
      "reward": 0.05164702236652374,
      "reward_std": 0.04170852527022362,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.001830025459639728,
      "rewards/logprob_reward/std": 0.002140195807442069,
      "step": 1351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 719.03125,
      "completions/mean_terminated_length": 709.1935424804688,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 4.172839506172839,
      "grad_norm": 2.392309633220599,
      "kl": 0.2503662109375,
      "learning_rate": 3.53072559820595e-08,
      "loss": -0.1483,
      "num_tokens": 38677569.0,
      "reward": 0.03700459375977516,
      "reward_std": 0.04947003722190857,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0029217731207609177,
      "rewards/logprob_reward/std": 0.0046602776274085045,
      "step": 1352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 719.4375,
      "completions/mean_terminated_length": 687.9310302734375,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 4.175925925925926,
      "grad_norm": 2.979264233519347,
      "kl": 0.2257080078125,
      "learning_rate": 3.505153881451997e-08,
      "loss": -0.1962,
      "num_tokens": 38707275.0,
      "reward": 0.04795299842953682,
      "reward_std": 0.048882901668548584,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001197776640765369,
      "rewards/logprob_reward/std": 0.0021472196094691753,
      "step": 1353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 719.3125,
      "completions/mean_terminated_length": 699.0000610351562,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 4.179012345679013,
      "grad_norm": 1.9787664379018888,
      "kl": 0.2291259765625,
      "learning_rate": 3.479668121761617e-08,
      "loss": -0.2475,
      "num_tokens": 38737113.0,
      "reward": 0.035641416907310486,
      "reward_std": 0.04145059734582901,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0014071294572204351,
      "rewards/logprob_reward/std": 0.0019757109694182873,
      "step": 1354
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 648.0,
      "completions/mean_terminated_length": 635.8709716796875,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 4.182098765432099,
      "grad_norm": 2.2545876367542177,
      "kl": NaN,
      "learning_rate": 3.45426842105139e-08,
      "loss": -0.1652,
      "num_tokens": 38764177.0,
      "reward": 0.052417002618312836,
      "reward_std": 0.04732451215386391,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0026855559553951025,
      "rewards/logprob_reward/std": 0.00396285206079483,
      "step": 1355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 726.21875,
      "completions/mean_terminated_length": 706.36669921875,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 4.185185185185185,
      "grad_norm": 2.0719623368070024,
      "kl": 0.234130859375,
      "learning_rate": 3.428954880893745e-08,
      "loss": -0.1586,
      "num_tokens": 38794124.0,
      "reward": 0.03140906244516373,
      "reward_std": 0.04791799187660217,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0036489595659077168,
      "rewards/logprob_reward/std": 0.005231354385614395,
      "step": 1356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 682.125,
      "completions/mean_terminated_length": 671.0967407226562,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 4.188271604938271,
      "grad_norm": 2.167518236895074,
      "kl": 0.24072265625,
      "learning_rate": 3.403727602516554e-08,
      "loss": -0.4284,
      "num_tokens": 38822296.0,
      "reward": 0.06536111980676651,
      "reward_std": 0.049857065081596375,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.003179023740813136,
      "rewards/logprob_reward/std": 0.004562661051750183,
      "step": 1357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 694.8125,
      "completions/mean_terminated_length": 684.1935424804688,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.191358024691358,
      "grad_norm": 2.097909096541004,
      "kl": 0.234375,
      "learning_rate": 3.3785866868027426e-08,
      "loss": -0.2141,
      "num_tokens": 38851246.0,
      "reward": 0.03531426936388016,
      "reward_std": 0.03942541778087616,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0010436322772875428,
      "rewards/logprob_reward/std": 0.0015748720616102219,
      "step": 1358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 657.5,
      "completions/mean_terminated_length": 645.6774291992188,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 4.194444444444445,
      "grad_norm": 2.4790533056536614,
      "kl": 0.2320556640625,
      "learning_rate": 3.353532234289849e-08,
      "loss": -0.0841,
      "num_tokens": 38878366.0,
      "reward": 0.06946359574794769,
      "reward_std": 0.039860717952251434,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0007928848499432206,
      "rewards/logprob_reward/std": 0.0010307712946087122,
      "step": 1359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 730.59375,
      "completions/mean_terminated_length": 688.6785888671875,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 4.197530864197531,
      "grad_norm": 1.8830012574848274,
      "kl": 0.2530517578125,
      "learning_rate": 3.3285643451696796e-08,
      "loss": -0.0983,
      "num_tokens": 38908225.0,
      "reward": 0.026378247886896133,
      "reward_std": 0.021089550107717514,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0015313858166337013,
      "rewards/logprob_reward/std": 0.0027785678394138813,
      "step": 1360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 956.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 683.96875,
      "completions/mean_terminated_length": 683.96875,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 4.200617283950617,
      "grad_norm": 1.966890784894002,
      "kl": 0.2239990234375,
      "learning_rate": 3.303683119287859e-08,
      "loss": -0.16,
      "num_tokens": 38936436.0,
      "reward": 0.0617934986948967,
      "reward_std": 0.04686152935028076,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0026872195303440094,
      "rewards/logprob_reward/std": 0.0031066283117979765,
      "step": 1361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 718.90625,
      "completions/mean_terminated_length": 698.5667114257812,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 4.203703703703703,
      "grad_norm": 1.8711672569110238,
      "kl": 0.2818603515625,
      "learning_rate": 3.278888656143453e-08,
      "loss": -0.3152,
      "num_tokens": 38965705.0,
      "reward": 0.04152519628405571,
      "reward_std": 0.04094218462705612,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0010002164635807276,
      "rewards/logprob_reward/std": 0.001981014385819435,
      "step": 1362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 697.125,
      "completions/mean_terminated_length": 675.3333740234375,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 4.20679012345679,
      "grad_norm": 2.076659231615485,
      "kl": 0.2169189453125,
      "learning_rate": 3.254181054888569e-08,
      "loss": -0.1851,
      "num_tokens": 38994201.0,
      "reward": 0.054154179990291595,
      "reward_std": 0.053396470844745636,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0011435305932536721,
      "rewards/logprob_reward/std": 0.0014338643522933125,
      "step": 1363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 749.90625,
      "completions/mean_terminated_length": 710.7500610351562,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 4.209876543209877,
      "grad_norm": 2.8981736423388247,
      "kl": 0.23486328125,
      "learning_rate": 3.2295604143279534e-08,
      "loss": -0.3005,
      "num_tokens": 39024930.0,
      "reward": 0.03994644433259964,
      "reward_std": 0.048851221799850464,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002718270756304264,
      "rewards/logprob_reward/std": 0.00442732498049736,
      "step": 1364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 667.96875,
      "completions/mean_terminated_length": 656.4838256835938,
      "completions/min_length": 503.0,
      "completions/min_terminated_length": 503.0,
      "epoch": 4.212962962962963,
      "grad_norm": 2.030662329879176,
      "kl": 0.2457275390625,
      "learning_rate": 3.205026832918606e-08,
      "loss": -0.2063,
      "num_tokens": 39052381.0,
      "reward": 0.06386682391166687,
      "reward_std": 0.047239288687705994,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0015186977107077837,
      "rewards/logprob_reward/std": 0.0020372355356812477,
      "step": 1365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 734.4375,
      "completions/mean_terminated_length": 704.4827270507812,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 4.216049382716049,
      "grad_norm": 1.888684409060091,
      "kl": 0.242919921875,
      "learning_rate": 3.1805804087693676e-08,
      "loss": -0.1763,
      "num_tokens": 39082511.0,
      "reward": 0.03575155884027481,
      "reward_std": 0.03975746035575867,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0015295101329684258,
      "rewards/logprob_reward/std": 0.002644570544362068,
      "step": 1366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 993.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 700.25,
      "completions/mean_terminated_length": 700.25,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 4.219135802469136,
      "grad_norm": 2.370034597374273,
      "kl": 0.21728515625,
      "learning_rate": 3.156221239640558e-08,
      "loss": -0.1534,
      "num_tokens": 39111487.0,
      "reward": 0.04550845921039581,
      "reward_std": 0.04069419205188751,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001953842118382454,
      "rewards/logprob_reward/std": 0.0026862630620598793,
      "step": 1367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 701.59375,
      "completions/mean_terminated_length": 680.1000366210938,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 4.222222222222222,
      "grad_norm": 2.271384703623736,
      "kl": 0.261474609375,
      "learning_rate": 3.13194942294355e-08,
      "loss": -0.1969,
      "num_tokens": 39140682.0,
      "reward": 0.030107038095593452,
      "reward_std": 0.04138566926121712,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.002202265430241823,
      "rewards/logprob_reward/std": 0.0035037552006542683,
      "step": 1368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 745.6875,
      "completions/mean_terminated_length": 727.1333618164062,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.2253086419753085,
      "grad_norm": 1.8019759463625786,
      "kl": 0.22705078125,
      "learning_rate": 3.1077650557404076e-08,
      "loss": -0.1772,
      "num_tokens": 39171316.0,
      "reward": 0.047880593687295914,
      "reward_std": 0.036105748265981674,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0011173278326168656,
      "rewards/logprob_reward/std": 0.001846394268795848,
      "step": 1369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 706.21875,
      "completions/mean_terminated_length": 695.9677124023438,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 4.228395061728395,
      "grad_norm": 1.9308349370910018,
      "kl": 0.3421630859375,
      "learning_rate": 3.083668234743489e-08,
      "loss": -0.2876,
      "num_tokens": 39200239.0,
      "reward": 0.03226442262530327,
      "reward_std": 0.02791484259068966,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0011271354742348194,
      "rewards/logprob_reward/std": 0.00173663510940969,
      "step": 1370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1000.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 669.8125,
      "completions/mean_terminated_length": 669.8125,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 4.231481481481482,
      "grad_norm": 3.255575472138576,
      "kl": 0.2352294921875,
      "learning_rate": 3.059659056315053e-08,
      "loss": -0.189,
      "num_tokens": 39227765.0,
      "reward": 0.04805910587310791,
      "reward_std": 0.04100500047206879,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0013156697386875749,
      "rewards/logprob_reward/std": 0.0016503233928233385,
      "step": 1371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 900.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 645.96875,
      "completions/mean_terminated_length": 645.96875,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 4.234567901234568,
      "grad_norm": 2.1378184413681165,
      "kl": 0.296875,
      "learning_rate": 3.035737616466885e-08,
      "loss": -0.092,
      "num_tokens": 39254680.0,
      "reward": 0.0521601140499115,
      "reward_std": 0.039865702390670776,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002400127239525318,
      "rewards/logprob_reward/std": 0.0025403748732060194,
      "step": 1372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 699.28125,
      "completions/mean_terminated_length": 677.6333618164062,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 4.237654320987654,
      "grad_norm": 1.8555821513419986,
      "kl": 0.2142333984375,
      "learning_rate": 3.0119040108598974e-08,
      "loss": -0.0071,
      "num_tokens": 39283413.0,
      "reward": 0.03886808454990387,
      "reward_std": 0.02776341140270233,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001520095276646316,
      "rewards/logprob_reward/std": 0.0023669328074902296,
      "step": 1373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 695.71875,
      "completions/mean_terminated_length": 673.8333740234375,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 4.2407407407407405,
      "grad_norm": 1.9709080116462059,
      "kl": 0.2943115234375,
      "learning_rate": 2.98815833480377e-08,
      "loss": -0.0631,
      "num_tokens": 39312128.0,
      "reward": 0.05827533081173897,
      "reward_std": 0.0337703675031662,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002250362653285265,
      "rewards/logprob_reward/std": 0.0022385185584425926,
      "step": 1374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 712.5,
      "completions/mean_terminated_length": 702.4515991210938,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 4.243827160493828,
      "grad_norm": 2.245351937601914,
      "kl": 0.2398681640625,
      "learning_rate": 2.964500683256549e-08,
      "loss": -0.1131,
      "num_tokens": 39341464.0,
      "reward": 0.04220541566610336,
      "reward_std": 0.04653865844011307,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0017560124397277832,
      "rewards/logprob_reward/std": 0.0026445803232491016,
      "step": 1375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 782.03125,
      "completions/mean_terminated_length": 747.4642944335938,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 4.246913580246914,
      "grad_norm": 2.8433615922445834,
      "kl": 0.2310791015625,
      "learning_rate": 2.9409311508242663e-08,
      "loss": -0.2622,
      "num_tokens": 39373565.0,
      "reward": 0.05018014460802078,
      "reward_std": 0.04673204571008682,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003672380466014147,
      "rewards/logprob_reward/std": 0.00520910881459713,
      "step": 1376
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 693.0,
      "completions/mean_terminated_length": 616.6154174804688,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 4.25,
      "grad_norm": 1.808520915238301,
      "kl": NaN,
      "learning_rate": 2.9174498317605794e-08,
      "loss": -0.1076,
      "num_tokens": 39402245.0,
      "reward": 0.051088765263557434,
      "reward_std": 0.03256085515022278,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.001209740643389523,
      "rewards/logprob_reward/std": 0.0015752612380310893,
      "step": 1377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 734.09375,
      "completions/mean_terminated_length": 704.1034545898438,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 4.253086419753086,
      "grad_norm": 1.7529749814005902,
      "kl": 0.210205078125,
      "learning_rate": 2.894056819966384e-08,
      "loss": -0.0914,
      "num_tokens": 39432356.0,
      "reward": 0.046020276844501495,
      "reward_std": 0.04837331920862198,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0025225291028618813,
      "rewards/logprob_reward/std": 0.0031702974811196327,
      "step": 1378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1013.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 702.46875,
      "completions/mean_terminated_length": 702.46875,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 4.256172839506172,
      "grad_norm": 2.1795713564780277,
      "kl": 0.22412109375,
      "learning_rate": 2.8707522089894354e-08,
      "loss": -0.1041,
      "num_tokens": 39461255.0,
      "reward": 0.05204486846923828,
      "reward_std": 0.055112261325120926,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002272073645144701,
      "rewards/logprob_reward/std": 0.003844283288344741,
      "step": 1379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 961.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 678.46875,
      "completions/mean_terminated_length": 678.46875,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 4.2592592592592595,
      "grad_norm": 2.391937387997386,
      "kl": 0.2279052734375,
      "learning_rate": 2.8475360920239723e-08,
      "loss": -0.1423,
      "num_tokens": 39489006.0,
      "reward": 0.04850921779870987,
      "reward_std": 0.053589120507240295,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0018157969461753964,
      "rewards/logprob_reward/std": 0.0021827947348356247,
      "step": 1380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 671.78125,
      "completions/mean_terminated_length": 660.4193115234375,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 4.262345679012346,
      "grad_norm": 2.406046061001516,
      "kl": 0.2579345703125,
      "learning_rate": 2.8244085619103546e-08,
      "loss": -0.218,
      "num_tokens": 39516895.0,
      "reward": 0.056279636919498444,
      "reward_std": 0.0488351508975029,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0035051533486694098,
      "rewards/logprob_reward/std": 0.004182832781225443,
      "step": 1381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 757.75,
      "completions/mean_terminated_length": 696.3077392578125,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 4.265432098765432,
      "grad_norm": 1.904087467486132,
      "kl": 0.210205078125,
      "learning_rate": 2.8013697111346906e-08,
      "loss": -0.1864,
      "num_tokens": 39548031.0,
      "reward": 0.05306212604045868,
      "reward_std": 0.0427543930709362,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.006874579470604658,
      "rewards/logprob_reward/std": 0.011082598939538002,
      "step": 1382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 756.5625,
      "completions/mean_terminated_length": 707.0370483398438,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 4.268518518518518,
      "grad_norm": 1.6790001394875598,
      "kl": 0.2911376953125,
      "learning_rate": 2.778419631828463e-08,
      "loss": -0.2758,
      "num_tokens": 39578361.0,
      "reward": 0.03895171731710434,
      "reward_std": 0.03504975885152817,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001613016938790679,
      "rewards/logprob_reward/std": 0.0025580748915672302,
      "step": 1383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 770.625,
      "completions/mean_terminated_length": 753.7333984375,
      "completions/min_length": 548.0,
      "completions/min_terminated_length": 548.0,
      "epoch": 4.271604938271605,
      "grad_norm": 2.083072600862685,
      "kl": 0.22509765625,
      "learning_rate": 2.755558415768147e-08,
      "loss": -0.2419,
      "num_tokens": 39609649.0,
      "reward": 0.046697311103343964,
      "reward_std": 0.051193609833717346,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003274786751717329,
      "rewards/logprob_reward/std": 0.006480569951236248,
      "step": 1384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 706.46875,
      "completions/mean_terminated_length": 685.300048828125,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 4.2746913580246915,
      "grad_norm": 1.883983869269119,
      "kl": 0.2314453125,
      "learning_rate": 2.732786154374869e-08,
      "loss": -0.1087,
      "num_tokens": 39638088.0,
      "reward": 0.06074865534901619,
      "reward_std": 0.041123319417238235,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015262835659086704,
      "rewards/logprob_reward/std": 0.001343188458122313,
      "step": 1385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 734.875,
      "completions/mean_terminated_length": 704.9655151367188,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 4.277777777777778,
      "grad_norm": 2.012086222635098,
      "kl": 0.21826171875,
      "learning_rate": 2.7101029387140318e-08,
      "loss": -0.064,
      "num_tokens": 39667924.0,
      "reward": 0.04400114715099335,
      "reward_std": 0.04902297258377075,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0037512709386646748,
      "rewards/logprob_reward/std": 0.004525472410023212,
      "step": 1386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 770.03125,
      "completions/mean_terminated_length": 743.7586059570312,
      "completions/min_length": 565.0,
      "completions/min_terminated_length": 565.0,
      "epoch": 4.280864197530864,
      "grad_norm": 2.3338081925878735,
      "kl": 0.1871337890625,
      "learning_rate": 2.6875088594949387e-08,
      "loss": -0.3447,
      "num_tokens": 39698941.0,
      "reward": 0.03246118873357773,
      "reward_std": 0.04250557720661163,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0013457657769322395,
      "rewards/logprob_reward/std": 0.0020066280849277973,
      "step": 1387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 909.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 646.9375,
      "completions/mean_terminated_length": 646.9375,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 4.283950617283951,
      "grad_norm": 2.157794785448398,
      "kl": 0.2391357421875,
      "learning_rate": 2.6650040070704484e-08,
      "loss": -0.1648,
      "num_tokens": 39725727.0,
      "reward": 0.05416993424296379,
      "reward_std": 0.04828999191522598,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0011610384099185467,
      "rewards/logprob_reward/std": 0.0022571422159671783,
      "step": 1388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 705.625,
      "completions/mean_terminated_length": 684.4000244140625,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.287037037037037,
      "grad_norm": 1.7919849200281552,
      "kl": 0.302001953125,
      "learning_rate": 2.6425884714365966e-08,
      "loss": -0.1799,
      "num_tokens": 39755059.0,
      "reward": 0.04657496511936188,
      "reward_std": 0.03497311845421791,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0031388518400490284,
      "rewards/logprob_reward/std": 0.007875935174524784,
      "step": 1389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 733.0,
      "completions/mean_terminated_length": 713.6000366210938,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 4.290123456790123,
      "grad_norm": 2.5103468795796444,
      "kl": 0.228515625,
      "learning_rate": 2.6202623422322546e-08,
      "loss": -0.3152,
      "num_tokens": 39784891.0,
      "reward": 0.04510669782757759,
      "reward_std": 0.04089469462633133,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001507441746070981,
      "rewards/logprob_reward/std": 0.0021598199382424355,
      "step": 1390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 633.53125,
      "completions/mean_terminated_length": 620.9354858398438,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 4.29320987654321,
      "grad_norm": 3.7293156285813556,
      "kl": 0.2447509765625,
      "learning_rate": 2.5980257087387546e-08,
      "loss": -0.2582,
      "num_tokens": 39811096.0,
      "reward": 0.06991346180438995,
      "reward_std": 0.03495609015226364,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0012927292846143246,
      "rewards/logprob_reward/std": 0.0014992320211604238,
      "step": 1391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 724.4375,
      "completions/mean_terminated_length": 704.4666748046875,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 4.296296296296296,
      "grad_norm": 2.0659177773554687,
      "kl": 0.226318359375,
      "learning_rate": 2.5758786598795325e-08,
      "loss": -0.3243,
      "num_tokens": 39841130.0,
      "reward": 0.04882894828915596,
      "reward_std": 0.04868233948945999,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0021710540167987347,
      "rewards/logprob_reward/std": 0.002986626233905554,
      "step": 1392
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 929.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 679.84375,
      "completions/mean_terminated_length": 679.84375,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 4.299382716049383,
      "grad_norm": 1.8252166403646315,
      "kl": NaN,
      "learning_rate": 2.5538212842197926e-08,
      "loss": -0.1682,
      "num_tokens": 39869257.0,
      "reward": 0.05994633585214615,
      "reward_std": 0.02662118338048458,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0006348142633214593,
      "rewards/logprob_reward/std": 0.0010241686832159758,
      "step": 1393
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 745.875,
      "completions/mean_terminated_length": 717.1034545898438,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 4.302469135802469,
      "grad_norm": 3.0771552623544185,
      "kl": NaN,
      "learning_rate": 2.5318536699661246e-08,
      "loss": -0.3404,
      "num_tokens": 39900081.0,
      "reward": 0.045463353395462036,
      "reward_std": 0.03326955810189247,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001903725671581924,
      "rewards/logprob_reward/std": 0.0034816248808056116,
      "step": 1394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 671.9375,
      "completions/mean_terminated_length": 635.5172119140625,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 4.305555555555555,
      "grad_norm": 2.067104103791924,
      "kl": 0.2216796875,
      "learning_rate": 2.5099759049661802e-08,
      "loss": -0.057,
      "num_tokens": 39928071.0,
      "reward": 0.04660975933074951,
      "reward_std": 0.040935173630714417,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003177509643137455,
      "rewards/logprob_reward/std": 0.0034811077639460564,
      "step": 1395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 689.46875,
      "completions/mean_terminated_length": 678.6774291992188,
      "completions/min_length": 498.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 4.308641975308642,
      "grad_norm": 1.9975501121958554,
      "kl": 0.2283935546875,
      "learning_rate": 2.4881880767083002e-08,
      "loss": -0.1026,
      "num_tokens": 39956278.0,
      "reward": 0.05892914906144142,
      "reward_std": 0.047882527112960815,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00297682941891253,
      "rewards/logprob_reward/std": 0.0047245929017663,
      "step": 1396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 796.3125,
      "completions/mean_terminated_length": 781.1333618164062,
      "completions/min_length": 535.0,
      "completions/min_terminated_length": 535.0,
      "epoch": 4.311728395061729,
      "grad_norm": 2.8479114820602693,
      "kl": 0.21240234375,
      "learning_rate": 2.4664902723211674e-08,
      "loss": -0.3645,
      "num_tokens": 39988672.0,
      "reward": 0.043073683977127075,
      "reward_std": 0.054729074239730835,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0027207641396671534,
      "rewards/logprob_reward/std": 0.002963118953630328,
      "step": 1397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 736.90625,
      "completions/mean_terminated_length": 707.2069091796875,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 4.314814814814815,
      "grad_norm": 2.2835337282673587,
      "kl": 0.2237548828125,
      "learning_rate": 2.444882578573476e-08,
      "loss": -0.3028,
      "num_tokens": 40018517.0,
      "reward": 0.04196564108133316,
      "reward_std": 0.04831092804670334,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0014895980712026358,
      "rewards/logprob_reward/std": 0.002160093979910016,
      "step": 1398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 705.09375,
      "completions/mean_terminated_length": 659.5357666015625,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 4.317901234567901,
      "grad_norm": 1.7934255303390536,
      "kl": 0.248046875,
      "learning_rate": 2.4233650818735573e-08,
      "loss": -0.1481,
      "num_tokens": 40047524.0,
      "reward": 0.04166494309902191,
      "reward_std": 0.04144533723592758,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0011554912198334932,
      "rewards/logprob_reward/std": 0.004127013962715864,
      "step": 1399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 693.71875,
      "completions/mean_terminated_length": 683.0645141601562,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 4.320987654320987,
      "grad_norm": 2.318450278702292,
      "kl": 0.2708740234375,
      "learning_rate": 2.401937868269058e-08,
      "loss": -0.1239,
      "num_tokens": 40076567.0,
      "reward": 0.03664903715252876,
      "reward_std": 0.04152647405862808,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0025267070159316063,
      "rewards/logprob_reward/std": 0.003993797581642866,
      "step": 1400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 702.96875,
      "completions/mean_terminated_length": 692.6128540039062,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 4.324074074074074,
      "grad_norm": 2.0341144644978972,
      "kl": 0.23876953125,
      "learning_rate": 2.380601023446577e-08,
      "loss": -0.1947,
      "num_tokens": 40105406.0,
      "reward": 0.05468962714076042,
      "reward_std": 0.04895017668604851,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0017384699312970042,
      "rewards/logprob_reward/std": 0.0022966843098402023,
      "step": 1401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 690.03125,
      "completions/mean_terminated_length": 667.7667236328125,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 4.327160493827161,
      "grad_norm": 2.171140198917671,
      "kl": 0.221923828125,
      "learning_rate": 2.3593546327313364e-08,
      "loss": -0.2481,
      "num_tokens": 40133907.0,
      "reward": 0.04149216040968895,
      "reward_std": 0.046369604766368866,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0009635099559091032,
      "rewards/logprob_reward/std": 0.0016679001273587346,
      "step": 1402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1014.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 684.90625,
      "completions/mean_terminated_length": 684.90625,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 4.330246913580247,
      "grad_norm": 2.0959285786124457,
      "kl": 0.2274169921875,
      "learning_rate": 2.338198781086842e-08,
      "loss": -0.2086,
      "num_tokens": 40161580.0,
      "reward": 0.04514283686876297,
      "reward_std": 0.04740273579955101,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0015475992113351822,
      "rewards/logprob_reward/std": 0.0022797882556915283,
      "step": 1403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 740.25,
      "completions/mean_terminated_length": 731.0967407226562,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 4.333333333333333,
      "grad_norm": 3.032831550053061,
      "kl": 0.24462890625,
      "learning_rate": 2.317133553114525e-08,
      "loss": -0.2835,
      "num_tokens": 40191800.0,
      "reward": 0.029408439993858337,
      "reward_std": 0.03851061314344406,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.001426044967956841,
      "rewards/logprob_reward/std": 0.002517278306186199,
      "step": 1404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 775.6875,
      "completions/mean_terminated_length": 767.6773681640625,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 4.33641975308642,
      "grad_norm": 2.115883231576716,
      "kl": 0.2034912109375,
      "learning_rate": 2.2961590330534298e-08,
      "loss": -0.2766,
      "num_tokens": 40223006.0,
      "reward": 0.04527488350868225,
      "reward_std": 0.045665763318538666,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001694312784820795,
      "rewards/logprob_reward/std": 0.0019908961839973927,
      "step": 1405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 734.28125,
      "completions/mean_terminated_length": 714.9666748046875,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 4.339506172839506,
      "grad_norm": 2.0970833008940555,
      "kl": 0.2313232421875,
      "learning_rate": 2.2752753047798502e-08,
      "loss": -0.1546,
      "num_tokens": 40253323.0,
      "reward": 0.03802924603223801,
      "reward_std": 0.04966511204838753,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.004060272127389908,
      "rewards/logprob_reward/std": 0.0048335883766412735,
      "step": 1406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 680.25,
      "completions/mean_terminated_length": 644.6896362304688,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 4.342592592592593,
      "grad_norm": 2.4367237598130904,
      "kl": 0.2589111328125,
      "learning_rate": 2.2544824518070104e-08,
      "loss": -0.2076,
      "num_tokens": 40281567.0,
      "reward": 0.05269753560423851,
      "reward_std": 0.0432589054107666,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0029972584452480078,
      "rewards/logprob_reward/std": 0.004715714603662491,
      "step": 1407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1009.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 703.1875,
      "completions/mean_terminated_length": 703.1875,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 4.345679012345679,
      "grad_norm": 2.2303496249660135,
      "kl": 0.2296142578125,
      "learning_rate": 2.2337805572847425e-08,
      "loss": -0.2227,
      "num_tokens": 40310705.0,
      "reward": 0.05487576127052307,
      "reward_std": 0.05081789195537567,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0019452865235507488,
      "rewards/logprob_reward/std": 0.0020653873216360807,
      "step": 1408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 744.8125,
      "completions/mean_terminated_length": 726.2000122070312,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 4.348765432098766,
      "grad_norm": 2.095914288307202,
      "kl": 0.2137451171875,
      "learning_rate": 2.2131697039991127e-08,
      "loss": -0.2036,
      "num_tokens": 40340931.0,
      "reward": 0.04556164890527725,
      "reward_std": 0.05527123063802719,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0020129410549998283,
      "rewards/logprob_reward/std": 0.0027564240153878927,
      "step": 1409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 822.25,
      "completions/mean_terminated_length": 784.888916015625,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 4.351851851851852,
      "grad_norm": 2.64787034770904,
      "kl": 0.202392578125,
      "learning_rate": 2.1926499743721405e-08,
      "loss": -0.2434,
      "num_tokens": 40374287.0,
      "reward": 0.037518374621868134,
      "reward_std": 0.049233656376600266,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.003492637537419796,
      "rewards/logprob_reward/std": 0.007689587771892548,
      "step": 1410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 952.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 686.96875,
      "completions/mean_terminated_length": 686.96875,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 4.354938271604938,
      "grad_norm": 1.821016041915507,
      "kl": 0.2220458984375,
      "learning_rate": 2.1722214504614313e-08,
      "loss": -0.1042,
      "num_tokens": 40402726.0,
      "reward": 0.05894976854324341,
      "reward_std": 0.03396419435739517,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0029997399542480707,
      "rewards/logprob_reward/std": 0.004429431166499853,
      "step": 1411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 710.96875,
      "completions/mean_terminated_length": 700.8709716796875,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 4.3580246913580245,
      "grad_norm": 3.34726012963772,
      "kl": 0.212646484375,
      "learning_rate": 2.1518842139598674e-08,
      "loss": -0.4076,
      "num_tokens": 40431489.0,
      "reward": 0.04314707964658737,
      "reward_std": 0.045407362282276154,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002802309114485979,
      "rewards/logprob_reward/std": 0.00475440826267004,
      "step": 1412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1008.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 731.3125,
      "completions/mean_terminated_length": 731.3125,
      "completions/min_length": 527.0,
      "completions/min_terminated_length": 527.0,
      "epoch": 4.361111111111111,
      "grad_norm": 2.2836553819565273,
      "kl": 0.2249755859375,
      "learning_rate": 2.1316383461952804e-08,
      "loss": -0.4415,
      "num_tokens": 40461763.0,
      "reward": 0.03941406309604645,
      "reward_std": 0.04996800422668457,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0021267374977469444,
      "rewards/logprob_reward/std": 0.003304164856672287,
      "step": 1413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 722.65625,
      "completions/mean_terminated_length": 691.4827270507812,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 4.364197530864198,
      "grad_norm": 2.596303238393781,
      "kl": 0.252197265625,
      "learning_rate": 2.1114839281301143e-08,
      "loss": -0.2563,
      "num_tokens": 40491432.0,
      "reward": 0.033059924840927124,
      "reward_std": 0.03590691089630127,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0020110271871089935,
      "rewards/logprob_reward/std": 0.0032212280202656984,
      "step": 1414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 743.03125,
      "completions/mean_terminated_length": 724.300048828125,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 4.367283950617284,
      "grad_norm": 2.092102307553979,
      "kl": 0.222412109375,
      "learning_rate": 2.0914210403611132e-08,
      "loss": -0.1412,
      "num_tokens": 40521597.0,
      "reward": 0.04807814210653305,
      "reward_std": 0.04798971116542816,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0013368211220949888,
      "rewards/logprob_reward/std": 0.002446343656629324,
      "step": 1415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 713.5625,
      "completions/mean_terminated_length": 692.86669921875,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 4.37037037037037,
      "grad_norm": 3.123376111092823,
      "kl": 0.2293701171875,
      "learning_rate": 2.071449763118993e-08,
      "loss": -0.3544,
      "num_tokens": 40551099.0,
      "reward": 0.052133187651634216,
      "reward_std": 0.04033924266695976,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002370205707848072,
      "rewards/logprob_reward/std": 0.0027446511667221785,
      "step": 1416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 749.125,
      "completions/mean_terminated_length": 740.258056640625,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.3734567901234565,
      "grad_norm": 1.9764372852447862,
      "kl": 0.2183837890625,
      "learning_rate": 2.0515701762681304e-08,
      "loss": -0.1034,
      "num_tokens": 40581775.0,
      "reward": 0.04872114956378937,
      "reward_std": 0.04157783091068268,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0020512789487838745,
      "rewards/logprob_reward/std": 0.0028042506892234087,
      "step": 1417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 687.53125,
      "completions/mean_terminated_length": 652.72412109375,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.3765432098765435,
      "grad_norm": 2.365794988097494,
      "kl": 0.240966796875,
      "learning_rate": 2.0317823593062165e-08,
      "loss": -0.2624,
      "num_tokens": 40610432.0,
      "reward": 0.04879935085773468,
      "reward_std": 0.046986665576696396,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0021381659898906946,
      "rewards/logprob_reward/std": 0.002953203860670328,
      "step": 1418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 820.0,
      "completions/mean_length": 675.34375,
      "completions/mean_terminated_length": 639.27587890625,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.37962962962963,
      "grad_norm": 2.9093568539631067,
      "kl": 0.250244140625,
      "learning_rate": 2.0120863913639874e-08,
      "loss": -0.364,
      "num_tokens": 40638335.0,
      "reward": 0.045894354581832886,
      "reward_std": 0.04815865308046341,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0023826141841709614,
      "rewards/logprob_reward/std": 0.003841497004032135,
      "step": 1419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 768.65625,
      "completions/mean_terminated_length": 732.1785888671875,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 4.382716049382716,
      "grad_norm": 2.0744272803030923,
      "kl": 0.1971435546875,
      "learning_rate": 1.9924823512048438e-08,
      "loss": -0.0882,
      "num_tokens": 40669696.0,
      "reward": 0.026432940736413002,
      "reward_std": 0.03985881432890892,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0015921569429337978,
      "rewards/logprob_reward/std": 0.0023353670258075,
      "step": 1420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 952.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 682.5625,
      "completions/mean_terminated_length": 682.5625,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 4.385802469135802,
      "grad_norm": 2.4074119485832135,
      "kl": 0.2091064453125,
      "learning_rate": 1.972970317224601e-08,
      "loss": -0.2167,
      "num_tokens": 40698014.0,
      "reward": 0.04284990206360817,
      "reward_std": 0.052097100764513016,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0024721133522689342,
      "rewards/logprob_reward/std": 0.0028523337095975876,
      "step": 1421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 738.46875,
      "completions/mean_terminated_length": 697.6785888671875,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 4.388888888888889,
      "grad_norm": 1.7022079121164682,
      "kl": 0.2012939453125,
      "learning_rate": 1.9535503674511263e-08,
      "loss": -0.0228,
      "num_tokens": 40728421.0,
      "reward": 0.03993780165910721,
      "reward_std": 0.04407007247209549,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002708667889237404,
      "rewards/logprob_reward/std": 0.006404683459550142,
      "step": 1422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 756.03125,
      "completions/mean_terminated_length": 738.1666870117188,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 4.3919753086419755,
      "grad_norm": 2.98292647074308,
      "kl": 0.2384033203125,
      "learning_rate": 1.934222579544059e-08,
      "loss": -0.2068,
      "num_tokens": 40759526.0,
      "reward": 0.029373236000537872,
      "reward_std": 0.044831715524196625,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0013869295362383127,
      "rewards/logprob_reward/std": 0.0021238508634269238,
      "step": 1423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 742.90625,
      "completions/mean_terminated_length": 713.8275756835938,
      "completions/min_length": 513.0,
      "completions/min_terminated_length": 513.0,
      "epoch": 4.395061728395062,
      "grad_norm": 1.9792308065221802,
      "kl": 0.2098388671875,
      "learning_rate": 1.9149870307944765e-08,
      "loss": -0.0489,
      "num_tokens": 40789767.0,
      "reward": 0.03917968273162842,
      "reward_std": 0.04533226788043976,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0018663115333765745,
      "rewards/logprob_reward/std": 0.0018212543800473213,
      "step": 1424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 903.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 680.84375,
      "completions/mean_terminated_length": 680.84375,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.398148148148148,
      "grad_norm": 1.8176212772817746,
      "kl": 0.2724609375,
      "learning_rate": 1.895843798124605e-08,
      "loss": -0.2014,
      "num_tokens": 40817602.0,
      "reward": 0.044841744005680084,
      "reward_std": 0.03986024111509323,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0012130499817430973,
      "rewards/logprob_reward/std": 0.0013913688017055392,
      "step": 1425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 684.6875,
      "completions/mean_terminated_length": 673.741943359375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 4.401234567901234,
      "grad_norm": 1.8734393836718208,
      "kl": 0.230224609375,
      "learning_rate": 1.8767929580874863e-08,
      "loss": -0.2068,
      "num_tokens": 40845968.0,
      "reward": 0.0494350790977478,
      "reward_std": 0.03326744586229324,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0028445315547287464,
      "rewards/logprob_reward/std": 0.003680290887132287,
      "step": 1426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 731.96875,
      "completions/mean_terminated_length": 722.54833984375,
      "completions/min_length": 537.0,
      "completions/min_terminated_length": 537.0,
      "epoch": 4.404320987654321,
      "grad_norm": 1.83093813038457,
      "kl": 0.22265625,
      "learning_rate": 1.8578345868666996e-08,
      "loss": -0.1444,
      "num_tokens": 40875659.0,
      "reward": 0.05107133463025093,
      "reward_std": 0.04415426403284073,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0011903708800673485,
      "rewards/logprob_reward/std": 0.0022207444999367,
      "step": 1427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 737.59375,
      "completions/mean_terminated_length": 696.6785888671875,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 4.407407407407407,
      "grad_norm": 2.1189025175553335,
      "kl": 0.2158203125,
      "learning_rate": 1.8389687602760495e-08,
      "loss": -0.1464,
      "num_tokens": 40905902.0,
      "reward": 0.04381987825036049,
      "reward_std": 0.03640786185860634,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0035498631186783314,
      "rewards/logprob_reward/std": 0.0034627187997102737,
      "step": 1428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 779.03125,
      "completions/mean_terminated_length": 744.0357666015625,
      "completions/min_length": 559.0,
      "completions/min_terminated_length": 559.0,
      "epoch": 4.410493827160494,
      "grad_norm": 2.3693258851612358,
      "kl": 0.22314453125,
      "learning_rate": 1.820195553759246e-08,
      "loss": -0.3637,
      "num_tokens": 40937051.0,
      "reward": 0.04203595221042633,
      "reward_std": 0.04921147972345352,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015677246265113354,
      "rewards/logprob_reward/std": 0.0030524819158017635,
      "step": 1429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 783.0,
      "completions/mean_length": 662.8125,
      "completions/mean_terminated_length": 638.7333374023438,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 4.41358024691358,
      "grad_norm": 2.4623191831819566,
      "kl": 0.257080078125,
      "learning_rate": 1.8015150423896203e-08,
      "loss": -0.2101,
      "num_tokens": 40965205.0,
      "reward": 0.05275503545999527,
      "reward_std": 0.044201165437698364,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0030611527618020773,
      "rewards/logprob_reward/std": 0.005246687680482864,
      "step": 1430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 950.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 661.53125,
      "completions/mean_terminated_length": 661.53125,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 4.416666666666667,
      "grad_norm": 1.8984126600052098,
      "kl": 0.2613525390625,
      "learning_rate": 1.782927300869827e-08,
      "loss": -0.0581,
      "num_tokens": 40993094.0,
      "reward": 0.041716672480106354,
      "reward_std": 0.048243045806884766,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.004685189574956894,
      "rewards/logprob_reward/std": 0.005050354637205601,
      "step": 1431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 706.0,
      "completions/mean_terminated_length": 695.741943359375,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 4.419753086419753,
      "grad_norm": 2.2240420842439685,
      "kl": 0.2259521484375,
      "learning_rate": 1.7644324035315212e-08,
      "loss": -0.212,
      "num_tokens": 41022302.0,
      "reward": 0.04929380118846893,
      "reward_std": 0.04793080687522888,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0026875524781644344,
      "rewards/logprob_reward/std": 0.0032262559980154037,
      "step": 1432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 742.46875,
      "completions/mean_terminated_length": 723.7000122070312,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 4.422839506172839,
      "grad_norm": 2.094867202988155,
      "kl": 0.23046875,
      "learning_rate": 1.746030424335093e-08,
      "loss": -0.1275,
      "num_tokens": 41052717.0,
      "reward": 0.05048564821481705,
      "reward_std": 0.04967077821493149,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.004011832177639008,
      "rewards/logprob_reward/std": 0.004363081883639097,
      "step": 1433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 740.5,
      "completions/mean_terminated_length": 721.6000366210938,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 4.425925925925926,
      "grad_norm": 1.8057482736396617,
      "kl": 0.221435546875,
      "learning_rate": 1.7277214368693423e-08,
      "loss": -0.1854,
      "num_tokens": 41082757.0,
      "reward": 0.04497251659631729,
      "reward_std": 0.047094471752643585,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0013583521358668804,
      "rewards/logprob_reward/std": 0.0019999288488179445,
      "step": 1434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 742.90625,
      "completions/mean_terminated_length": 733.8386840820312,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 4.429012345679013,
      "grad_norm": 2.063458027364136,
      "kl": 0.21728515625,
      "learning_rate": 1.7095055143512117e-08,
      "loss": -0.0462,
      "num_tokens": 41113226.0,
      "reward": 0.053193461149930954,
      "reward_std": 0.047015219926834106,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0035482917446643114,
      "rewards/logprob_reward/std": 0.005106258671730757,
      "step": 1435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 721.125,
      "completions/mean_terminated_length": 677.857177734375,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 4.432098765432099,
      "grad_norm": 2.3736942178064284,
      "kl": 0.229248046875,
      "learning_rate": 1.6913827296254736e-08,
      "loss": -0.3049,
      "num_tokens": 41142422.0,
      "reward": 0.04816795140504837,
      "reward_std": 0.05544901639223099,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0014366109389811754,
      "rewards/logprob_reward/std": 0.0024734637700021267,
      "step": 1436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 727.9375,
      "completions/mean_terminated_length": 685.6428833007812,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 4.435185185185185,
      "grad_norm": 2.1933093433517485,
      "kl": 0.2091064453125,
      "learning_rate": 1.6733531551644503e-08,
      "loss": -0.1723,
      "num_tokens": 41172224.0,
      "reward": 0.042622070759534836,
      "reward_std": 0.04842061549425125,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002218965208157897,
      "rewards/logprob_reward/std": 0.0035111114848405123,
      "step": 1437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 702.21875,
      "completions/mean_terminated_length": 691.8386840820312,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.438271604938271,
      "grad_norm": 2.173028707071614,
      "kl": 0.2318115234375,
      "learning_rate": 1.655416863067713e-08,
      "loss": -0.1134,
      "num_tokens": 41201363.0,
      "reward": 0.0569436177611351,
      "reward_std": 0.056063245981931686,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0042429035529494286,
      "rewards/logprob_reward/std": 0.003971173893660307,
      "step": 1438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 691.375,
      "completions/mean_terminated_length": 669.2000122070312,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.441358024691358,
      "grad_norm": 1.9605198610090238,
      "kl": 0.2335205078125,
      "learning_rate": 1.637573925061808e-08,
      "loss": -0.1832,
      "num_tokens": 41229963.0,
      "reward": 0.030255049467086792,
      "reward_std": 0.0344797745347023,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.002366722794249654,
      "rewards/logprob_reward/std": 0.002940770238637924,
      "step": 1439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 746.0625,
      "completions/mean_terminated_length": 737.0967407226562,
      "completions/min_length": 537.0,
      "completions/min_terminated_length": 537.0,
      "epoch": 4.444444444444445,
      "grad_norm": 3.097304803998429,
      "kl": 0.2078857421875,
      "learning_rate": 1.6198244124999592e-08,
      "loss": -0.4188,
      "num_tokens": 41260737.0,
      "reward": 0.04235662519931793,
      "reward_std": 0.04883340001106262,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0019240305991843343,
      "rewards/logprob_reward/std": 0.002048332476988435,
      "step": 1440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 722.03125,
      "completions/mean_terminated_length": 678.8928833007812,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "epoch": 4.447530864197531,
      "grad_norm": 2.339085377303552,
      "kl": 0.244140625,
      "learning_rate": 1.6021683963617805e-08,
      "loss": -0.3539,
      "num_tokens": 41290606.0,
      "reward": 0.05370953679084778,
      "reward_std": 0.05755479633808136,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0041217077523469925,
      "rewards/logprob_reward/std": 0.006467892322689295,
      "step": 1441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 769.6875,
      "completions/mean_terminated_length": 733.357177734375,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 4.450617283950617,
      "grad_norm": 2.992079515886702,
      "kl": 0.22802734375,
      "learning_rate": 1.5846059472530122e-08,
      "loss": -0.3015,
      "num_tokens": 41322048.0,
      "reward": 0.036230962723493576,
      "reward_std": 0.04625852778553963,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0020621800795197487,
      "rewards/logprob_reward/std": 0.0030165647622197866,
      "step": 1442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 726.78125,
      "completions/mean_terminated_length": 706.9666748046875,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 4.453703703703704,
      "grad_norm": 1.86928224847978,
      "kl": 0.2103271484375,
      "learning_rate": 1.5671371354051997e-08,
      "loss": -0.056,
      "num_tokens": 41351797.0,
      "reward": 0.06443355232477188,
      "reward_std": 0.04753619059920311,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0021483921445906162,
      "rewards/logprob_reward/std": 0.0018268699059262872,
      "step": 1443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1012.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 714.4375,
      "completions/mean_terminated_length": 714.4375,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 4.45679012345679,
      "grad_norm": 1.9845127790703219,
      "kl": 0.228515625,
      "learning_rate": 1.5497620306754582e-08,
      "loss": -0.0882,
      "num_tokens": 41381399.0,
      "reward": 0.0543513149023056,
      "reward_std": 0.04788962006568909,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0048347944393754005,
      "rewards/logprob_reward/std": 0.005204932298511267,
      "step": 1444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 704.5625,
      "completions/mean_terminated_length": 694.258056640625,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.459876543209877,
      "grad_norm": 1.830456933116766,
      "kl": 0.2293701171875,
      "learning_rate": 1.5324807025461656e-08,
      "loss": -0.0102,
      "num_tokens": 41410661.0,
      "reward": 0.06272735446691513,
      "reward_std": 0.041232675313949585,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.003724841633811593,
      "rewards/logprob_reward/std": 0.003868513973429799,
      "step": 1445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 807.0,
      "completions/mean_length": 741.375,
      "completions/mean_terminated_length": 647.1666870117188,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 4.462962962962963,
      "grad_norm": 1.6299599810837322,
      "kl": 0.221923828125,
      "learning_rate": 1.515293220124683e-08,
      "loss": -0.0934,
      "num_tokens": 41440813.0,
      "reward": 0.058781467378139496,
      "reward_std": 0.040851958096027374,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0028127399273216724,
      "rewards/logprob_reward/std": 0.003479891689494252,
      "step": 1446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 657.75,
      "completions/mean_terminated_length": 633.3333740234375,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 4.466049382716049,
      "grad_norm": 1.952834187605714,
      "kl": 0.25390625,
      "learning_rate": 1.498199652143092e-08,
      "loss": -0.0952,
      "num_tokens": 41468229.0,
      "reward": 0.03513897582888603,
      "reward_std": 0.035858407616615295,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0008488652529194951,
      "rewards/logprob_reward/std": 0.0015317859360948205,
      "step": 1447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 713.78125,
      "completions/mean_terminated_length": 693.1000366210938,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.469135802469136,
      "grad_norm": 2.1990872625180407,
      "kl": 0.22412109375,
      "learning_rate": 1.4812000669579188e-08,
      "loss": -0.0744,
      "num_tokens": 41497506.0,
      "reward": 0.04628187045454979,
      "reward_std": 0.04848606139421463,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0028131892904639244,
      "rewards/logprob_reward/std": 0.003807139117270708,
      "step": 1448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 719.3125,
      "completions/mean_terminated_length": 687.7930908203125,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 4.472222222222222,
      "grad_norm": 2.709511431835891,
      "kl": 0.2344970703125,
      "learning_rate": 1.4642945325498507e-08,
      "loss": -0.3521,
      "num_tokens": 41526672.0,
      "reward": 0.05511452257633209,
      "reward_std": 0.05433478578925133,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0022105767857283354,
      "rewards/logprob_reward/std": 0.0026712631806731224,
      "step": 1449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 775.1875,
      "completions/mean_terminated_length": 767.1612548828125,
      "completions/min_length": 544.0,
      "completions/min_terminated_length": 544.0,
      "epoch": 4.4753086419753085,
      "grad_norm": 1.846995978285614,
      "kl": 0.2335205078125,
      "learning_rate": 1.4474831165234707e-08,
      "loss": -0.0631,
      "num_tokens": 41558186.0,
      "reward": 0.03356396034359932,
      "reward_std": 0.043709829449653625,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0025710673071444035,
      "rewards/logprob_reward/std": 0.005059478338807821,
      "step": 1450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 716.34375,
      "completions/mean_terminated_length": 695.8333740234375,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 4.478395061728395,
      "grad_norm": 2.5699828993214893,
      "kl": 0.22216796875,
      "learning_rate": 1.4307658861069799e-08,
      "loss": -0.2157,
      "num_tokens": 41587285.0,
      "reward": 0.06077142804861069,
      "reward_std": 0.0416644886136055,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0015515873674303293,
      "rewards/logprob_reward/std": 0.0024837180972099304,
      "step": 1451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 734.71875,
      "completions/mean_terminated_length": 681.1481323242188,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 4.481481481481482,
      "grad_norm": 1.9556211887061505,
      "kl": 0.212890625,
      "learning_rate": 1.414142908151944e-08,
      "loss": -0.0813,
      "num_tokens": 41617472.0,
      "reward": 0.03968391567468643,
      "reward_std": 0.041357140988111496,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0024265716783702374,
      "rewards/logprob_reward/std": 0.003852495225146413,
      "step": 1452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 675.0,
      "completions/mean_terminated_length": 663.741943359375,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.484567901234568,
      "grad_norm": 1.8457607288324611,
      "kl": 0.251708984375,
      "learning_rate": 1.3976142491330111e-08,
      "loss": -0.0615,
      "num_tokens": 41645440.0,
      "reward": 0.042487286031246185,
      "reward_std": 0.04263980686664581,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0020692090038210154,
      "rewards/logprob_reward/std": 0.0031202638056129217,
      "step": 1453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 706.28125,
      "completions/mean_terminated_length": 696.0322265625,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.487654320987654,
      "grad_norm": 1.8081229291005072,
      "kl": 0.2325439453125,
      "learning_rate": 1.3811799751476588e-08,
      "loss": -0.1682,
      "num_tokens": 41674581.0,
      "reward": 0.04569516330957413,
      "reward_std": 0.046075932681560516,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002161290729418397,
      "rewards/logprob_reward/std": 0.0034060017205774784,
      "step": 1454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 745.71875,
      "completions/mean_terminated_length": 716.9310302734375,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 4.4907407407407405,
      "grad_norm": 2.086739826205387,
      "kl": 0.2305908203125,
      "learning_rate": 1.3648401519159109e-08,
      "loss": -0.1724,
      "num_tokens": 41705036.0,
      "reward": 0.05555104464292526,
      "reward_std": 0.05617810785770416,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0026956028304994106,
      "rewards/logprob_reward/std": 0.0028205085545778275,
      "step": 1455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 709.15625,
      "completions/mean_terminated_length": 699.0,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 4.493827160493828,
      "grad_norm": 1.8750074933357899,
      "kl": 0.2535400390625,
      "learning_rate": 1.348594844780096e-08,
      "loss": -0.1478,
      "num_tokens": 41734553.0,
      "reward": 0.04616174101829529,
      "reward_std": 0.027845583856105804,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0026797144673764706,
      "rewards/logprob_reward/std": 0.0039057082030922174,
      "step": 1456
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 710.84375,
      "completions/mean_terminated_length": 700.741943359375,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 4.496913580246914,
      "grad_norm": 2.1763105296110963,
      "kl": NaN,
      "learning_rate": 1.332444118704576e-08,
      "loss": -0.0882,
      "num_tokens": 41763700.0,
      "reward": 0.03868158906698227,
      "reward_std": 0.039767876267433167,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0013128730934113264,
      "rewards/logprob_reward/std": 0.002047280315309763,
      "step": 1457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 722.875,
      "completions/mean_terminated_length": 667.1111450195312,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 4.5,
      "grad_norm": 2.353725973027106,
      "kl": 0.238525390625,
      "learning_rate": 1.3163880382754761e-08,
      "loss": -0.2077,
      "num_tokens": 41793028.0,
      "reward": 0.05095834285020828,
      "reward_std": 0.05258800834417343,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0010648233583196998,
      "rewards/logprob_reward/std": 0.001401307643391192,
      "step": 1458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 661.59375,
      "completions/mean_terminated_length": 649.9031982421875,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 4.503086419753086,
      "grad_norm": 2.6339210231801538,
      "kl": 0.266357421875,
      "learning_rate": 1.3004266677004522e-08,
      "loss": -0.1457,
      "num_tokens": 41820539.0,
      "reward": 0.03324113041162491,
      "reward_std": 0.0461549237370491,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0022123625967651606,
      "rewards/logprob_reward/std": 0.0030606554355472326,
      "step": 1459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 694.9375,
      "completions/mean_terminated_length": 684.3225708007812,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 4.506172839506172,
      "grad_norm": 3.6026827986687056,
      "kl": 0.2335205078125,
      "learning_rate": 1.2845600708084076e-08,
      "loss": -0.2701,
      "num_tokens": 41849217.0,
      "reward": 0.04239705204963684,
      "reward_std": 0.04868008941411972,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001968946773558855,
      "rewards/logprob_reward/std": 0.002376778516918421,
      "step": 1460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 745.96875,
      "completions/mean_terminated_length": 706.2500610351562,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 4.5092592592592595,
      "grad_norm": 2.2401348755430135,
      "kl": 0.2501220703125,
      "learning_rate": 1.2687883110492515e-08,
      "loss": -0.0612,
      "num_tokens": 41879856.0,
      "reward": 0.035798899829387665,
      "reward_std": 0.03567592054605484,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0015821070410311222,
      "rewards/logprob_reward/std": 0.003936965949833393,
      "step": 1461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1000.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 674.34375,
      "completions/mean_terminated_length": 674.34375,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.512345679012346,
      "grad_norm": 1.7618294060687367,
      "kl": 0.249267578125,
      "learning_rate": 1.2531114514936491e-08,
      "loss": -0.2088,
      "num_tokens": 41908243.0,
      "reward": 0.049269404262304306,
      "reward_std": 0.042050741612911224,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002660449594259262,
      "rewards/logprob_reward/std": 0.00429100776091218,
      "step": 1462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 657.0,
      "completions/mean_terminated_length": 645.1612548828125,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 4.515432098765432,
      "grad_norm": 2.331651801288181,
      "kl": 0.2408447265625,
      "learning_rate": 1.2375295548327557e-08,
      "loss": -0.0584,
      "num_tokens": 41935819.0,
      "reward": 0.07170861214399338,
      "reward_std": 0.04488102346658707,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.003287344705313444,
      "rewards/logprob_reward/std": 0.004149049986153841,
      "step": 1463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 768.28125,
      "completions/mean_terminated_length": 741.8275756835938,
      "completions/min_length": 535.0,
      "completions/min_terminated_length": 535.0,
      "epoch": 4.518518518518518,
      "grad_norm": 3.652022522373431,
      "kl": 0.218017578125,
      "learning_rate": 1.222042683377983e-08,
      "loss": -0.285,
      "num_tokens": 41966936.0,
      "reward": 0.03882598876953125,
      "reward_std": 0.04753109812736511,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0014733191346749663,
      "rewards/logprob_reward/std": 0.0019834123086184263,
      "step": 1464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 677.59375,
      "completions/mean_terminated_length": 666.4193115234375,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 4.521604938271605,
      "grad_norm": 2.261481769322289,
      "kl": 0.23388671875,
      "learning_rate": 1.2066508990607293e-08,
      "loss": -0.117,
      "num_tokens": 41994939.0,
      "reward": 0.054833658039569855,
      "reward_std": 0.034459188580513,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0018985085189342499,
      "rewards/logprob_reward/std": 0.0021323813125491142,
      "step": 1465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 702.9375,
      "completions/mean_terminated_length": 681.5333862304688,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 4.5246913580246915,
      "grad_norm": 1.9807012356553644,
      "kl": 0.22998046875,
      "learning_rate": 1.1913542634321538e-08,
      "loss": -0.1366,
      "num_tokens": 42024409.0,
      "reward": 0.04338686540722847,
      "reward_std": 0.049047164618968964,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0030687383841723204,
      "rewards/logprob_reward/std": 0.004367289133369923,
      "step": 1466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 700.59375,
      "completions/mean_terminated_length": 690.1612548828125,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.527777777777778,
      "grad_norm": 1.927326373765374,
      "kl": 0.2569580078125,
      "learning_rate": 1.1761528376629137e-08,
      "loss": -0.1766,
      "num_tokens": 42053420.0,
      "reward": 0.046754371374845505,
      "reward_std": 0.04168115183711052,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003338192356750369,
      "rewards/logprob_reward/std": 0.005756665021181107,
      "step": 1467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 706.65625,
      "completions/mean_terminated_length": 673.8275756835938,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 4.530864197530864,
      "grad_norm": 5.224802349532697,
      "kl": 0.2313232421875,
      "learning_rate": 1.1610466825429182e-08,
      "loss": -0.4865,
      "num_tokens": 42082273.0,
      "reward": 0.045427750796079636,
      "reward_std": 0.05316973477602005,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0018641665810719132,
      "rewards/logprob_reward/std": 0.002744182012975216,
      "step": 1468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 723.84375,
      "completions/mean_terminated_length": 680.9642944335938,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 4.533950617283951,
      "grad_norm": 2.165261482173354,
      "kl": 0.228759765625,
      "learning_rate": 1.1460358584811091e-08,
      "loss": -0.2355,
      "num_tokens": 42111716.0,
      "reward": 0.05476970225572586,
      "reward_std": 0.04580404981970787,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0018274460453540087,
      "rewards/logprob_reward/std": 0.003082863288000226,
      "step": 1469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 711.34375,
      "completions/mean_terminated_length": 690.5000610351562,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 4.537037037037037,
      "grad_norm": 2.2478714434765923,
      "kl": 0.3079833984375,
      "learning_rate": 1.1311204255051942e-08,
      "loss": -0.3927,
      "num_tokens": 42140659.0,
      "reward": 0.05164527893066406,
      "reward_std": 0.05694463104009628,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.001828084234148264,
      "rewards/logprob_reward/std": 0.0025435511488467455,
      "step": 1470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 998.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 660.09375,
      "completions/mean_terminated_length": 660.09375,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.540123456790123,
      "grad_norm": 2.1010252200970534,
      "kl": 0.2474365234375,
      "learning_rate": 1.116300443261417e-08,
      "loss": -0.0856,
      "num_tokens": 42167706.0,
      "reward": 0.05780253931879997,
      "reward_std": 0.047235894948244095,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017250396776944399,
      "rewards/logprob_reward/std": 0.0027445878367871046,
      "step": 1471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 705.125,
      "completions/mean_terminated_length": 694.8386840820312,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.54320987654321,
      "grad_norm": 2.129030236758863,
      "kl": 0.2327880859375,
      "learning_rate": 1.1015759710143124e-08,
      "loss": -0.118,
      "num_tokens": 42196554.0,
      "reward": 0.05206025391817093,
      "reward_std": 0.03269369155168533,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002289172261953354,
      "rewards/logprob_reward/std": 0.0026913422625511885,
      "step": 1472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 702.625,
      "completions/mean_terminated_length": 692.258056640625,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 4.546296296296296,
      "grad_norm": 2.0399069526424154,
      "kl": 0.2205810546875,
      "learning_rate": 1.0869470676464848e-08,
      "loss": -0.1414,
      "num_tokens": 42225342.0,
      "reward": 0.04880621284246445,
      "reward_std": 0.0432891882956028,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0021457888651639223,
      "rewards/logprob_reward/std": 0.0022100957576185465,
      "step": 1473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 871.0,
      "completions/mean_length": 736.59375,
      "completions/mean_terminated_length": 706.862060546875,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 4.549382716049383,
      "grad_norm": 2.563073558337134,
      "kl": 0.24822998046875,
      "learning_rate": 1.0724137916583525e-08,
      "loss": -0.2259,
      "num_tokens": 42255797.0,
      "reward": 0.027438707649707794,
      "reward_std": 0.03702589124441147,
      "rewards/format_reward_func/mean": 0.25,
      "rewards/format_reward_func/std": 0.4399413466453552,
      "rewards/logprob_reward/mean": 0.0027096762787550688,
      "rewards/logprob_reward/std": 0.0035238105338066816,
      "step": 1474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 672.78125,
      "completions/mean_terminated_length": 649.36669921875,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 4.552469135802469,
      "grad_norm": 2.1662622318861273,
      "kl": 0.2301025390625,
      "learning_rate": 1.0579762011679317e-08,
      "loss": -0.2272,
      "num_tokens": 42283374.0,
      "reward": 0.0521787628531456,
      "reward_std": 0.05530615895986557,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002420843578875065,
      "rewards/logprob_reward/std": 0.0027172542177140713,
      "step": 1475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 893.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 647.9375,
      "completions/mean_terminated_length": 647.9375,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 4.555555555555555,
      "grad_norm": 2.5338564161310284,
      "kl": 0.2235107421875,
      "learning_rate": 1.0436343539105857e-08,
      "loss": -0.4645,
      "num_tokens": 42310576.0,
      "reward": 0.05766887590289116,
      "reward_std": 0.047901451587677,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0015765284188091755,
      "rewards/logprob_reward/std": 0.0021711974404752254,
      "step": 1476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 732.125,
      "completions/mean_terminated_length": 701.9310302734375,
      "completions/min_length": 544.0,
      "completions/min_terminated_length": 544.0,
      "epoch": 4.5586419753086425,
      "grad_norm": 1.7270498962110772,
      "kl": 0.1954345703125,
      "learning_rate": 1.0293883072388154e-08,
      "loss": -0.0339,
      "num_tokens": 42340660.0,
      "reward": 0.05205375701189041,
      "reward_std": 0.04099983721971512,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0022819519508630037,
      "rewards/logprob_reward/std": 0.0033365946728736162,
      "step": 1477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 731.0,
      "completions/mean_terminated_length": 711.4666748046875,
      "completions/min_length": 525.0,
      "completions/min_terminated_length": 525.0,
      "epoch": 4.561728395061729,
      "grad_norm": 2.6254814053042845,
      "kl": 0.3048095703125,
      "learning_rate": 1.015238118122011e-08,
      "loss": -0.1352,
      "num_tokens": 42370484.0,
      "reward": 0.04639824107289314,
      "reward_std": 0.05790696293115616,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0029424885287880898,
      "rewards/logprob_reward/std": 0.010873885825276375,
      "step": 1478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 729.90625,
      "completions/mean_terminated_length": 710.300048828125,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 4.564814814814815,
      "grad_norm": 2.143129440770457,
      "kl": 0.2186279296875,
      "learning_rate": 1.0011838431462389e-08,
      "loss": -0.2439,
      "num_tokens": 42400293.0,
      "reward": 0.049573834985494614,
      "reward_std": 0.05485403537750244,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0029987043235450983,
      "rewards/logprob_reward/std": 0.0052993991412222385,
      "step": 1479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 769.3125,
      "completions/mean_terminated_length": 722.1481323242188,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 4.567901234567901,
      "grad_norm": 2.194235872064849,
      "kl": 0.190185546875,
      "learning_rate": 9.872255385140027e-09,
      "loss": -0.2519,
      "num_tokens": 42431567.0,
      "reward": 0.03871108219027519,
      "reward_std": 0.04252662882208824,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0013456502929329872,
      "rewards/logprob_reward/std": 0.001864749239757657,
      "step": 1480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 705.5,
      "completions/mean_terminated_length": 695.2257690429688,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.570987654320987,
      "grad_norm": 1.7435056472001802,
      "kl": 0.247314453125,
      "learning_rate": 9.733632600440245e-09,
      "loss": -0.1419,
      "num_tokens": 42460563.0,
      "reward": 0.0358835905790329,
      "reward_std": 0.02217365801334381,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0016762100858613849,
      "rewards/logprob_reward/std": 0.002464568242430687,
      "step": 1481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 711.78125,
      "completions/mean_terminated_length": 667.1785888671875,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 4.574074074074074,
      "grad_norm": 2.990968568617281,
      "kl": 0.2244873046875,
      "learning_rate": 9.595970631710248e-09,
      "loss": -0.3499,
      "num_tokens": 42489456.0,
      "reward": 0.028968021273612976,
      "reward_std": 0.029035020619630814,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0009366908925585449,
      "rewards/logprob_reward/std": 0.0016238440293818712,
      "step": 1482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 948.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 659.5625,
      "completions/mean_terminated_length": 659.5625,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 4.577160493827161,
      "grad_norm": 2.202255369039433,
      "kl": 0.24755859375,
      "learning_rate": 9.459270029454986e-09,
      "loss": -0.0597,
      "num_tokens": 42517030.0,
      "reward": 0.05343109369277954,
      "reward_std": 0.03393975645303726,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.003812328912317753,
      "rewards/logprob_reward/std": 0.004090317524969578,
      "step": 1483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 693.59375,
      "completions/mean_terminated_length": 682.9354858398438,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 4.580246913580247,
      "grad_norm": 2.18173859494406,
      "kl": 0.2265625,
      "learning_rate": 9.323531340334868e-09,
      "loss": -0.1217,
      "num_tokens": 42546053.0,
      "reward": 0.049290746450424194,
      "reward_std": 0.04943094775080681,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002684164559468627,
      "rewards/logprob_reward/std": 0.00423125596717,
      "step": 1484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 666.21875,
      "completions/mean_terminated_length": 654.6774291992188,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 4.583333333333333,
      "grad_norm": 2.067985335920416,
      "kl": 0.2274169921875,
      "learning_rate": 9.188755107163743e-09,
      "loss": -0.1437,
      "num_tokens": 42573588.0,
      "reward": 0.04903118312358856,
      "reward_std": 0.04188349097967148,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002395759802311659,
      "rewards/logprob_reward/std": 0.00443643145263195,
      "step": 1485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 823.0,
      "completions/mean_length": 688.34375,
      "completions/mean_terminated_length": 626.1851806640625,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 4.58641975308642,
      "grad_norm": 1.5822348621800693,
      "kl": 0.231689453125,
      "learning_rate": 9.054941868906513e-09,
      "loss": -0.0769,
      "num_tokens": 42601931.0,
      "reward": 0.0462818518280983,
      "reward_std": 0.03339197486639023,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002813167404383421,
      "rewards/logprob_reward/std": 0.007091212552040815,
      "step": 1486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 746.40625,
      "completions/mean_terminated_length": 727.9000244140625,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 4.589506172839506,
      "grad_norm": 2.276546124105462,
      "kl": 0.2197265625,
      "learning_rate": 8.922092160677242e-09,
      "loss": -0.2447,
      "num_tokens": 42632344.0,
      "reward": 0.055323585867881775,
      "reward_std": 0.051866527646780014,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002442875411361456,
      "rewards/logprob_reward/std": 0.0038433405570685863,
      "step": 1487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 673.15625,
      "completions/mean_terminated_length": 661.8386840820312,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 4.592592592592593,
      "grad_norm": 3.684538766166176,
      "kl": 0.2410888671875,
      "learning_rate": 8.79020651373677e-09,
      "loss": -0.2779,
      "num_tokens": 42660513.0,
      "reward": 0.06448155641555786,
      "reward_std": 0.04148316755890846,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002201724797487259,
      "rewards/logprob_reward/std": 0.00433498527854681,
      "step": 1488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 702.4375,
      "completions/mean_terminated_length": 669.1724243164062,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.595679012345679,
      "grad_norm": 2.1059833651414617,
      "kl": 0.235107421875,
      "learning_rate": 8.659285455490745e-09,
      "loss": 0.0114,
      "num_tokens": 42689363.0,
      "reward": 0.05539703741669655,
      "reward_std": 0.04169042780995369,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0025244837161153555,
      "rewards/logprob_reward/std": 0.0028246596921235323,
      "step": 1489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 908.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 666.46875,
      "completions/mean_terminated_length": 666.46875,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 4.598765432098766,
      "grad_norm": 3.542601292283675,
      "kl": 0.277587890625,
      "learning_rate": 8.529329509487455e-09,
      "loss": -0.2771,
      "num_tokens": 42717690.0,
      "reward": 0.04708962142467499,
      "reward_std": 0.041435856372117996,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0037106885574758053,
      "rewards/logprob_reward/std": 0.005584856029599905,
      "step": 1490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 762.96875,
      "completions/mean_terminated_length": 725.6785888671875,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.601851851851852,
      "grad_norm": 2.1466412930937917,
      "kl": 0.2491455078125,
      "learning_rate": 8.400339195415718e-09,
      "loss": -0.0973,
      "num_tokens": 42748813.0,
      "reward": 0.02977364882826805,
      "reward_std": 0.041639409959316254,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0018318291986361146,
      "rewards/logprob_reward/std": 0.0028566711116582155,
      "step": 1491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 947.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 662.3125,
      "completions/mean_terminated_length": 662.3125,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 4.604938271604938,
      "grad_norm": 1.937843807352332,
      "kl": 0.257568359375,
      "learning_rate": 8.272315029102888e-09,
      "loss": -0.2336,
      "num_tokens": 42776471.0,
      "reward": 0.07350407540798187,
      "reward_std": 0.04663775861263275,
      "rewards/format_reward_func/mean": 0.71875,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0018100799061357975,
      "rewards/logprob_reward/std": 0.0023184253368526697,
      "step": 1492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 725.375,
      "completions/mean_terminated_length": 705.4666748046875,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 4.6080246913580245,
      "grad_norm": 2.548582697170482,
      "kl": 0.2386474609375,
      "learning_rate": 8.145257522512606e-09,
      "loss": -0.2308,
      "num_tokens": 42806551.0,
      "reward": 0.03310152143239975,
      "reward_std": 0.048257678747177124,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0020572440698742867,
      "rewards/logprob_reward/std": 0.003052032319828868,
      "step": 1493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 727.65625,
      "completions/mean_terminated_length": 685.3214721679688,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.611111111111111,
      "grad_norm": 1.8255106970483308,
      "kl": 0.214599609375,
      "learning_rate": 8.019167183743041e-09,
      "loss": -0.1165,
      "num_tokens": 42836004.0,
      "reward": 0.04193819314241409,
      "reward_std": 0.05020206421613693,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0014591040089726448,
      "rewards/logprob_reward/std": 0.0024056523106992245,
      "step": 1494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 697.1875,
      "completions/mean_terminated_length": 675.4000244140625,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 4.614197530864198,
      "grad_norm": 1.8193479924266922,
      "kl": 0.2515869140625,
      "learning_rate": 7.89404451702455e-09,
      "loss": -0.0577,
      "num_tokens": 42864538.0,
      "reward": 0.04537242278456688,
      "reward_std": 0.034791670739650726,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0018026924226433039,
      "rewards/logprob_reward/std": 0.002865416230633855,
      "step": 1495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 710.9375,
      "completions/mean_terminated_length": 678.5516967773438,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 4.617283950617284,
      "grad_norm": 2.132767058374767,
      "kl": 0.2283935546875,
      "learning_rate": 7.769890022717884e-09,
      "loss": -0.2006,
      "num_tokens": 42893748.0,
      "reward": 0.04399431496858597,
      "reward_std": 0.03478696942329407,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.003743681125342846,
      "rewards/logprob_reward/std": 0.0048117870464921,
      "step": 1496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 714.6875,
      "completions/mean_terminated_length": 694.0667114257812,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 4.62037037037037,
      "grad_norm": 2.2418023665666853,
      "kl": 0.2222900390625,
      "learning_rate": 7.646704197312143e-09,
      "loss": -0.1643,
      "num_tokens": 42923290.0,
      "reward": 0.04454777017235756,
      "reward_std": 0.043588101863861084,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.004358632955700159,
      "rewards/logprob_reward/std": 0.006084616295993328,
      "step": 1497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 711.125,
      "completions/mean_terminated_length": 690.2667236328125,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 4.6234567901234565,
      "grad_norm": 3.553604400037332,
      "kl": 0.2537841796875,
      "learning_rate": 7.524487533422635e-09,
      "loss": -0.3947,
      "num_tokens": 42952566.0,
      "reward": 0.03583184629678726,
      "reward_std": 0.0420500822365284,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0016187188448384404,
      "rewards/logprob_reward/std": 0.0026539175305515528,
      "step": 1498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 755.0,
      "completions/mean_terminated_length": 746.3225708007812,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 4.6265432098765435,
      "grad_norm": 2.144779694803398,
      "kl": 0.2220458984375,
      "learning_rate": 7.403240519789161e-09,
      "loss": -0.1333,
      "num_tokens": 42983842.0,
      "reward": 0.04972454160451889,
      "reward_std": 0.035087473690509796,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0031661540269851685,
      "rewards/logprob_reward/std": 0.003860093653202057,
      "step": 1499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 732.59375,
      "completions/mean_terminated_length": 713.1666870117188,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 4.62962962962963,
      "grad_norm": 1.9280861009947392,
      "kl": 0.2061767578125,
      "learning_rate": 7.282963641273842e-09,
      "loss": -0.1129,
      "num_tokens": 43013929.0,
      "reward": 0.05114641785621643,
      "reward_std": 0.053203750401735306,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0012737988727167249,
      "rewards/logprob_reward/std": 0.002420877804979682,
      "step": 1500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 666.625,
      "completions/mean_terminated_length": 642.800048828125,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 4.632716049382716,
      "grad_norm": 3.0161430734232613,
      "kl": 0.231201171875,
      "learning_rate": 7.163657378859267e-09,
      "loss": -0.4704,
      "num_tokens": 43042165.0,
      "reward": 0.03871387615799904,
      "reward_std": 0.047514379024505615,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.001348750782199204,
      "rewards/logprob_reward/std": 0.002594175050035119,
      "step": 1501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 693.1875,
      "completions/mean_terminated_length": 682.51611328125,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 4.635802469135802,
      "grad_norm": 1.938330864772763,
      "kl": 0.2303466796875,
      "learning_rate": 7.045322209646654e-09,
      "loss": -0.2025,
      "num_tokens": 43070563.0,
      "reward": 0.06738285720348358,
      "reward_std": 0.027124961838126183,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.001953174127265811,
      "rewards/logprob_reward/std": 0.0018129246309399605,
      "step": 1502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 651.78125,
      "completions/mean_terminated_length": 639.774169921875,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 4.638888888888889,
      "grad_norm": 2.135924664049642,
      "kl": 0.2335205078125,
      "learning_rate": 6.927958606853746e-09,
      "loss": -0.0142,
      "num_tokens": 43097824.0,
      "reward": 0.04533563554286957,
      "reward_std": 0.05339189991354942,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017618188867345452,
      "rewards/logprob_reward/std": 0.0021908129565417767,
      "step": 1503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 646.9375,
      "completions/mean_terminated_length": 634.774169921875,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 4.6419753086419755,
      "grad_norm": 2.320433802131267,
      "kl": 0.2471923828125,
      "learning_rate": 6.811567039813087e-09,
      "loss": -0.1788,
      "num_tokens": 43124434.0,
      "reward": 0.05148729309439659,
      "reward_std": 0.04719836264848709,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0016525487881153822,
      "rewards/logprob_reward/std": 0.0019936866592615843,
      "step": 1504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 744.78125,
      "completions/mean_terminated_length": 715.8965454101562,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 4.645061728395062,
      "grad_norm": 2.1228448380201153,
      "kl": 0.2081298828125,
      "learning_rate": 6.696147973970112e-09,
      "loss": -0.0033,
      "num_tokens": 43154599.0,
      "reward": 0.05481375381350517,
      "reward_std": 0.05460665374994278,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0018763924017548561,
      "rewards/logprob_reward/std": 0.0029246618505567312,
      "step": 1505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 719.65625,
      "completions/mean_terminated_length": 709.8386840820312,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.648148148148148,
      "grad_norm": 2.436848909203649,
      "kl": 0.2147216796875,
      "learning_rate": 6.581701870881196e-09,
      "loss": -0.3078,
      "num_tokens": 43183764.0,
      "reward": 0.051449697464704514,
      "reward_std": 0.046063363552093506,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0016107733827084303,
      "rewards/logprob_reward/std": 0.002504688920453191,
      "step": 1506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 662.0,
      "completions/mean_terminated_length": 650.3225708007812,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 4.651234567901234,
      "grad_norm": 2.014260519970987,
      "kl": 0.229248046875,
      "learning_rate": 6.4682291882119375e-09,
      "loss": -0.1049,
      "num_tokens": 43211264.0,
      "reward": 0.07040219008922577,
      "reward_std": 0.0403471514582634,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0018357643857598305,
      "rewards/logprob_reward/std": 0.0019348099594935775,
      "step": 1507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 676.71875,
      "completions/mean_terminated_length": 640.7930908203125,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 4.654320987654321,
      "grad_norm": 2.460559046510438,
      "kl": 0.26416015625,
      "learning_rate": 6.355730379735219e-09,
      "loss": -0.0752,
      "num_tokens": 43239091.0,
      "reward": 0.04869456589221954,
      "reward_std": 0.04262574017047882,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0020217373967170715,
      "rewards/logprob_reward/std": 0.005422488786280155,
      "step": 1508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 711.09375,
      "completions/mean_terminated_length": 666.3928833007812,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 4.657407407407407,
      "grad_norm": 2.293936676742903,
      "kl": 0.2314453125,
      "learning_rate": 6.244205895329452e-09,
      "loss": -0.0937,
      "num_tokens": 43268330.0,
      "reward": 0.06514693796634674,
      "reward_std": 0.04283153638243675,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002941043581813574,
      "rewards/logprob_reward/std": 0.002941833809018135,
      "step": 1509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 867.0,
      "completions/max_terminated_length": 867.0,
      "completions/mean_length": 656.0,
      "completions/mean_terminated_length": 656.0,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.660493827160494,
      "grad_norm": 2.3794356247810753,
      "kl": 0.2310791015625,
      "learning_rate": 6.133656180976776e-09,
      "loss": -0.2736,
      "num_tokens": 43296014.0,
      "reward": 0.04233257472515106,
      "reward_std": 0.027813980355858803,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.001897301059216261,
      "rewards/logprob_reward/std": 0.0024011824280023575,
      "step": 1510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 939.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 631.78125,
      "completions/mean_terminated_length": 631.78125,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 4.66358024691358,
      "grad_norm": 2.207114514188508,
      "kl": 0.2685546875,
      "learning_rate": 6.024081678761228e-09,
      "loss": -0.1798,
      "num_tokens": 43322323.0,
      "reward": 0.05467233061790466,
      "reward_std": 0.04698755964636803,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0017192568629980087,
      "rewards/logprob_reward/std": 0.002006774302572012,
      "step": 1511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 756.71875,
      "completions/mean_terminated_length": 681.8800048828125,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.666666666666667,
      "grad_norm": 4.018916638189679,
      "kl": 0.250244140625,
      "learning_rate": 5.915482826867047e-09,
      "loss": -0.4026,
      "num_tokens": 43353330.0,
      "reward": 0.048289455473423004,
      "reward_std": 0.04794459789991379,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001571614877320826,
      "rewards/logprob_reward/std": 0.004118291661143303,
      "step": 1512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 713.71875,
      "completions/mean_terminated_length": 693.0333862304688,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.669753086419753,
      "grad_norm": 2.683404102549931,
      "kl": 0.220703125,
      "learning_rate": 5.807860059576841e-09,
      "loss": -0.3136,
      "num_tokens": 43382557.0,
      "reward": 0.05862727761268616,
      "reward_std": 0.04001375660300255,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0026414182502776384,
      "rewards/logprob_reward/std": 0.005117638502269983,
      "step": 1513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 781.03125,
      "completions/mean_terminated_length": 755.8965454101562,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 4.672839506172839,
      "grad_norm": 2.159997404539417,
      "kl": 0.199462890625,
      "learning_rate": 5.701213807269956e-09,
      "loss": -0.2434,
      "num_tokens": 43413994.0,
      "reward": 0.055472537875175476,
      "reward_std": 0.04255915433168411,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0026083746924996376,
      "rewards/logprob_reward/std": 0.004415425937622786,
      "step": 1514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 707.9375,
      "completions/mean_terminated_length": 697.741943359375,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 4.675925925925926,
      "grad_norm": 2.621603568875648,
      "kl": 0.2293701171875,
      "learning_rate": 5.5955444964206345e-09,
      "loss": -0.2414,
      "num_tokens": 43442832.0,
      "reward": 0.05145654082298279,
      "reward_std": 0.03780600428581238,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.001618378795683384,
      "rewards/logprob_reward/std": 0.0022882563062012196,
      "step": 1515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 835.0,
      "completions/max_terminated_length": 835.0,
      "completions/mean_length": 689.8125,
      "completions/mean_terminated_length": 689.8125,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 4.679012345679013,
      "grad_norm": 1.8923983034097638,
      "kl": 0.225830078125,
      "learning_rate": 5.490852549596387e-09,
      "loss": -0.1754,
      "num_tokens": 43471962.0,
      "reward": 0.050171319395303726,
      "reward_std": 0.04255622625350952,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0036625780630856752,
      "rewards/logprob_reward/std": 0.004384476691484451,
      "step": 1516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 738.8125,
      "completions/mean_terminated_length": 686.0,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 4.682098765432099,
      "grad_norm": 1.8941827081878828,
      "kl": 0.2362060546875,
      "learning_rate": 5.387138385456319e-09,
      "loss": -0.0436,
      "num_tokens": 43501872.0,
      "reward": 0.05162039399147034,
      "reward_std": 0.02073848992586136,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.00180044153239578,
      "rewards/logprob_reward/std": 0.002051715040579438,
      "step": 1517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 758.46875,
      "completions/mean_terminated_length": 731.0,
      "completions/min_length": 553.0,
      "completions/min_terminated_length": 553.0,
      "epoch": 4.685185185185185,
      "grad_norm": 2.2133668662481414,
      "kl": 0.2122802734375,
      "learning_rate": 5.284402418749362e-09,
      "loss": -0.1702,
      "num_tokens": 43532927.0,
      "reward": 0.050726257264614105,
      "reward_std": 0.04197445139288902,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.004279176238924265,
      "rewards/logprob_reward/std": 0.007665267214179039,
      "step": 1518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 723.5,
      "completions/mean_terminated_length": 667.8518676757812,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 4.688271604938271,
      "grad_norm": 2.005056288520359,
      "kl": 0.23681640625,
      "learning_rate": 5.182645060312685e-09,
      "loss": -0.1067,
      "num_tokens": 43562927.0,
      "reward": 0.033513665199279785,
      "reward_std": 0.027011813595891,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0025151814334094524,
      "rewards/logprob_reward/std": 0.004920200444757938,
      "step": 1519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1010.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 690.34375,
      "completions/mean_terminated_length": 690.34375,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 4.6913580246913575,
      "grad_norm": 2.57913775796617,
      "kl": 0.2568359375,
      "learning_rate": 5.081866717070088e-09,
      "loss": -0.4261,
      "num_tokens": 43591182.0,
      "reward": 0.043917812407016754,
      "reward_std": 0.034200187772512436,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.003658680710941553,
      "rewards/logprob_reward/std": 0.004496648907661438,
      "step": 1520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 735.8125,
      "completions/mean_terminated_length": 694.6428833007812,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 4.694444444444445,
      "grad_norm": 2.029400397554303,
      "kl": 0.214111328125,
      "learning_rate": 4.9820677920302534e-09,
      "loss": -0.1665,
      "num_tokens": 43621208.0,
      "reward": 0.06271371245384216,
      "reward_std": 0.03440491110086441,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.003709673648700118,
      "rewards/logprob_reward/std": 0.003139941254630685,
      "step": 1521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 671.53125,
      "completions/mean_terminated_length": 660.1612548828125,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 4.697530864197531,
      "grad_norm": 2.308131113431546,
      "kl": 0.2225341796875,
      "learning_rate": 4.883248684285302e-09,
      "loss": -0.2318,
      "num_tokens": 43649165.0,
      "reward": 0.05717732012271881,
      "reward_std": 0.05596880614757538,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.004502573981881142,
      "rewards/logprob_reward/std": 0.005215025972574949,
      "step": 1522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 754.375,
      "completions/mean_terminated_length": 736.4000244140625,
      "completions/min_length": 503.0,
      "completions/min_terminated_length": 503.0,
      "epoch": 4.700617283950617,
      "grad_norm": 1.9382971045463089,
      "kl": 0.2247314453125,
      "learning_rate": 4.785409789008988e-09,
      "loss": -0.3379,
      "num_tokens": 43679437.0,
      "reward": 0.03940431401133537,
      "reward_std": 0.04100026935338974,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.00211590644903481,
      "rewards/logprob_reward/std": 0.0024238715413957834,
      "step": 1523
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 754.84375,
      "completions/mean_terminated_length": 649.521728515625,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 4.703703703703704,
      "grad_norm": 3.7134581259618886,
      "kl": NaN,
      "learning_rate": 4.68855149745534e-09,
      "loss": -0.228,
      "num_tokens": 43710480.0,
      "reward": 0.03510964661836624,
      "reward_std": 0.02885759249329567,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0008162733865901828,
      "rewards/logprob_reward/std": 0.0019914295990020037,
      "step": 1524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 705.0625,
      "completions/mean_terminated_length": 672.0689697265625,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 4.70679012345679,
      "grad_norm": 3.5558333106059634,
      "kl": 0.234375,
      "learning_rate": 4.592674196956914e-09,
      "loss": -0.3241,
      "num_tokens": 43739658.0,
      "reward": 0.04232990741729736,
      "reward_std": 0.050770103931427,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.00189433922059834,
      "rewards/logprob_reward/std": 0.0025692505296319723,
      "step": 1525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 703.75,
      "completions/mean_terminated_length": 670.6206665039062,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 4.709876543209877,
      "grad_norm": 2.982549396593798,
      "kl": 0.2117919921875,
      "learning_rate": 4.497778270923374e-09,
      "loss": -0.3901,
      "num_tokens": 43768566.0,
      "reward": 0.05457794666290283,
      "reward_std": 0.04171760752797127,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0016143880784511566,
      "rewards/logprob_reward/std": 0.002547757001593709,
      "step": 1526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 691.34375,
      "completions/mean_terminated_length": 680.6128540039062,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 4.712962962962963,
      "grad_norm": 1.8374788501829256,
      "kl": 0.230712890625,
      "learning_rate": 4.403864098839833e-09,
      "loss": -0.0062,
      "num_tokens": 43796969.0,
      "reward": 0.05811849236488342,
      "reward_std": 0.03246118128299713,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.002076097298413515,
      "rewards/logprob_reward/std": 0.002303823595866561,
      "step": 1527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 689.71875,
      "completions/mean_terminated_length": 678.9354858398438,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 4.716049382716049,
      "grad_norm": 2.0907773916798154,
      "kl": 0.227783203125,
      "learning_rate": 4.31093205626551e-09,
      "loss": -0.1669,
      "num_tokens": 43825252.0,
      "reward": 0.061343710869550705,
      "reward_std": 0.05555129051208496,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0021874543745070696,
      "rewards/logprob_reward/std": 0.0020785327069461346,
      "step": 1528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 946.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 653.6875,
      "completions/mean_terminated_length": 653.6875,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 4.719135802469136,
      "grad_norm": 2.5871332535777176,
      "kl": 0.2325439453125,
      "learning_rate": 4.218982514832048e-09,
      "loss": -0.401,
      "num_tokens": 43852822.0,
      "reward": 0.04688587039709091,
      "reward_std": 0.0505446121096611,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003484301269054413,
      "rewards/logprob_reward/std": 0.004275031853467226,
      "step": 1529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 702.71875,
      "completions/mean_terminated_length": 692.3547973632812,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 4.722222222222222,
      "grad_norm": 2.9221650012489384,
      "kl": 0.2108154296875,
      "learning_rate": 4.128015842242122e-09,
      "loss": -0.2255,
      "num_tokens": 43881733.0,
      "reward": 0.04261261597275734,
      "reward_std": 0.04130998253822327,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002208464778959751,
      "rewards/logprob_reward/std": 0.00345027563162148,
      "step": 1530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 749.09375,
      "completions/mean_terminated_length": 720.6551513671875,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 4.7253086419753085,
      "grad_norm": 2.404028400713213,
      "kl": 0.2393798828125,
      "learning_rate": 4.0380324022679935e-09,
      "loss": -0.281,
      "num_tokens": 43913168.0,
      "reward": 0.049170732498168945,
      "reward_std": 0.051856983453035355,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.019911926239728928,
      "rewards/logprob_reward/std": 0.05080845579504967,
      "step": 1531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 737.53125,
      "completions/mean_terminated_length": 728.290283203125,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 4.728395061728395,
      "grad_norm": 2.0370504351113516,
      "kl": 0.20068359375,
      "learning_rate": 3.9490325547499316e-09,
      "loss": -0.0427,
      "num_tokens": 43943845.0,
      "reward": 0.04996287077665329,
      "reward_std": 0.04933923855423927,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0034309634938836098,
      "rewards/logprob_reward/std": 0.004405571613460779,
      "step": 1532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 744.25,
      "completions/mean_terminated_length": 715.3103637695312,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.731481481481482,
      "grad_norm": 2.2135796312301093,
      "kl": 0.2001953125,
      "learning_rate": 3.861016655594962e-09,
      "loss": -0.2235,
      "num_tokens": 43974121.0,
      "reward": 0.055599234998226166,
      "reward_std": 0.047450773417949677,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0027491520158946514,
      "rewards/logprob_reward/std": 0.00328139984048903,
      "step": 1533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 703.6875,
      "completions/mean_terminated_length": 693.3547973632812,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 4.734567901234568,
      "grad_norm": 1.9148919795249735,
      "kl": 0.2255859375,
      "learning_rate": 3.773985056775258e-09,
      "loss": -0.0338,
      "num_tokens": 44002847.0,
      "reward": 0.06723778694868088,
      "reward_std": 0.04859152436256409,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0017919890815392137,
      "rewards/logprob_reward/std": 0.002055313903838396,
      "step": 1534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 686.34375,
      "completions/mean_terminated_length": 675.4515991210938,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 4.737654320987654,
      "grad_norm": 2.016759515633251,
      "kl": 0.2088623046875,
      "learning_rate": 3.68793810632681e-09,
      "loss": -0.2412,
      "num_tokens": 44031374.0,
      "reward": 0.054454416036605835,
      "reward_std": 0.050731875002384186,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0014771256828680634,
      "rewards/logprob_reward/std": 0.002718152478337288,
      "step": 1535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1007.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 724.0625,
      "completions/mean_terminated_length": 724.0625,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 4.7407407407407405,
      "grad_norm": 1.9265106398716043,
      "kl": 0.2135009765625,
      "learning_rate": 3.602876148348116e-09,
      "loss": -0.1655,
      "num_tokens": 44060856.0,
      "reward": 0.042583052068948746,
      "reward_std": 0.03934343159198761,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002175615169107914,
      "rewards/logprob_reward/std": 0.002967662876471877,
      "step": 1536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 657.6875,
      "completions/mean_terminated_length": 645.8709716796875,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 4.743827160493828,
      "grad_norm": 3.299285365438242,
      "kl": 0.226318359375,
      "learning_rate": 3.518799522998661e-09,
      "loss": -0.3395,
      "num_tokens": 44088018.0,
      "reward": 0.041676074266433716,
      "reward_std": 0.053768157958984375,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0011678591836243868,
      "rewards/logprob_reward/std": 0.0019146768609061837,
      "step": 1537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 776.40625,
      "completions/mean_terminated_length": 719.269287109375,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 4.746913580246914,
      "grad_norm": 1.6565600344802227,
      "kl": 0.20068359375,
      "learning_rate": 3.435708566497608e-09,
      "loss": -0.0962,
      "num_tokens": 44119195.0,
      "reward": 0.040542714297771454,
      "reward_std": 0.04417931288480759,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.003380796406418085,
      "rewards/logprob_reward/std": 0.006241412367671728,
      "step": 1538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 856.0,
      "completions/mean_length": 665.3125,
      "completions/mean_terminated_length": 641.4000244140625,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 4.75,
      "grad_norm": 1.7865661626285811,
      "kl": 0.258544921875,
      "learning_rate": 3.353603611122524e-09,
      "loss": -0.2299,
      "num_tokens": 44146489.0,
      "reward": 0.06413263827562332,
      "reward_std": 0.04776488244533539,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0018140419851988554,
      "rewards/logprob_reward/std": 0.0035549812018871307,
      "step": 1539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 665.34375,
      "completions/mean_terminated_length": 641.433349609375,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 4.753086419753086,
      "grad_norm": 2.079446226907431,
      "kl": 0.2318115234375,
      "learning_rate": 3.2724849852079628e-09,
      "loss": -0.1236,
      "num_tokens": 44173964.0,
      "reward": 0.044496528804302216,
      "reward_std": 0.04535555839538574,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0008294780272990465,
      "rewards/logprob_reward/std": 0.001505252905189991,
      "step": 1540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 725.4375,
      "completions/mean_terminated_length": 715.8064575195312,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 4.756172839506172,
      "grad_norm": 3.1618937626713195,
      "kl": 0.2412109375,
      "learning_rate": 3.192353013144189e-09,
      "loss": -0.2779,
      "num_tokens": 44203642.0,
      "reward": 0.038170669227838516,
      "reward_std": 0.03268398344516754,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0007451868150383234,
      "rewards/logprob_reward/std": 0.001686312723904848,
      "step": 1541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 721.03125,
      "completions/mean_terminated_length": 677.75,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 4.7592592592592595,
      "grad_norm": 2.050281713419281,
      "kl": 0.2061767578125,
      "learning_rate": 3.113208015375901e-09,
      "loss": -0.1335,
      "num_tokens": 44233011.0,
      "reward": 0.05213935300707817,
      "reward_std": 0.045283056795597076,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002377056982368231,
      "rewards/logprob_reward/std": 0.0033256742171943188,
      "step": 1542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 717.53125,
      "completions/mean_terminated_length": 697.1000366210938,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 4.762345679012346,
      "grad_norm": 1.7723029762016207,
      "kl": 0.2257080078125,
      "learning_rate": 3.0350503084008995e-09,
      "loss": -0.1376,
      "num_tokens": 44262740.0,
      "reward": 0.05031438544392586,
      "reward_std": 0.03975418582558632,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0038215392269194126,
      "rewards/logprob_reward/std": 0.006404296960681677,
      "step": 1543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 656.1875,
      "completions/mean_terminated_length": 644.3225708007812,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 4.765432098765432,
      "grad_norm": 1.778152137398992,
      "kl": 0.2470703125,
      "learning_rate": 2.957880204768809e-09,
      "loss": -0.0711,
      "num_tokens": 44289642.0,
      "reward": 0.05754018574953079,
      "reward_std": 0.03257685899734497,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001433537108823657,
      "rewards/logprob_reward/std": 0.0015094984555616975,
      "step": 1544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 757.375,
      "completions/mean_terminated_length": 719.2857666015625,
      "completions/min_length": 542.0,
      "completions/min_terminated_length": 542.0,
      "epoch": 4.768518518518518,
      "grad_norm": 2.3460314398604862,
      "kl": 0.220703125,
      "learning_rate": 2.8816980130799418e-09,
      "loss": -0.1334,
      "num_tokens": 44320898.0,
      "reward": 0.04554835706949234,
      "reward_std": 0.04604505002498627,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0019981749355793,
      "rewards/logprob_reward/std": 0.002213164698332548,
      "step": 1545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 635.8125,
      "completions/mean_terminated_length": 623.290283203125,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 4.771604938271605,
      "grad_norm": 1.8312174372739913,
      "kl": 0.2366943359375,
      "learning_rate": 2.806504037983992e-09,
      "loss": -0.0846,
      "num_tokens": 44347812.0,
      "reward": 0.05764756724238396,
      "reward_std": 0.04722445458173752,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0015528519870713353,
      "rewards/logprob_reward/std": 0.0020970383193343878,
      "step": 1546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 758.84375,
      "completions/mean_terminated_length": 731.413818359375,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.7746913580246915,
      "grad_norm": 2.053868627912756,
      "kl": 0.21142578125,
      "learning_rate": 2.7322985801787046e-09,
      "loss": -0.121,
      "num_tokens": 44378323.0,
      "reward": 0.04052641987800598,
      "reward_std": 0.041819311678409576,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.003362688235938549,
      "rewards/logprob_reward/std": 0.004958003293722868,
      "step": 1547
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 733.40625,
      "completions/mean_terminated_length": 714.0333862304688,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 4.777777777777778,
      "grad_norm": 1.865677782906322,
      "kl": NaN,
      "learning_rate": 2.6590819364088746e-09,
      "loss": -0.0866,
      "num_tokens": 44408708.0,
      "reward": 0.04511159658432007,
      "reward_std": 0.027660556137561798,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001512888353317976,
      "rewards/logprob_reward/std": 0.002074025571346283,
      "step": 1548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 715.5625,
      "completions/mean_terminated_length": 695.0000610351562,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.780864197530864,
      "grad_norm": 2.2663529037789614,
      "kl": 0.2218017578125,
      "learning_rate": 2.5868543994650993e-09,
      "loss": -0.1056,
      "num_tokens": 44437970.0,
      "reward": 0.05412521958351135,
      "reward_std": 0.0419880636036396,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.004583575762808323,
      "rewards/logprob_reward/std": 0.009514378383755684,
      "step": 1549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 705.40625,
      "completions/mean_terminated_length": 684.1666870117188,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 4.783950617283951,
      "grad_norm": 2.170932131963563,
      "kl": 0.2294921875,
      "learning_rate": 2.5156162581824736e-09,
      "loss": -0.1829,
      "num_tokens": 44467083.0,
      "reward": 0.05616780370473862,
      "reward_std": 0.04913990572094917,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0033808897715061903,
      "rewards/logprob_reward/std": 0.004154331050813198,
      "step": 1550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 761.8125,
      "completions/mean_terminated_length": 713.25927734375,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 4.787037037037037,
      "grad_norm": 1.9210242421969543,
      "kl": 0.200439453125,
      "learning_rate": 2.44536779743959e-09,
      "loss": -0.2208,
      "num_tokens": 44497877.0,
      "reward": 0.048709139227867126,
      "reward_std": 0.032009467482566833,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002037932863458991,
      "rewards/logprob_reward/std": 0.0029220767319202423,
      "step": 1551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 737.21875,
      "completions/mean_terminated_length": 696.2500610351562,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 4.790123456790123,
      "grad_norm": 1.993314091973738,
      "kl": 0.2431640625,
      "learning_rate": 2.376109298157347e-09,
      "loss": -0.1321,
      "num_tokens": 44528140.0,
      "reward": 0.059702493250370026,
      "reward_std": 0.03522733226418495,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0038360999897122383,
      "rewards/logprob_reward/std": 0.0049207513220608234,
      "step": 1552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 670.75,
      "completions/mean_terminated_length": 659.3547973632812,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 4.79320987654321,
      "grad_norm": 2.3662139425006297,
      "kl": 0.244873046875,
      "learning_rate": 2.3078410372978084e-09,
      "loss": -0.1522,
      "num_tokens": 44555908.0,
      "reward": 0.05789976567029953,
      "reward_std": 0.04513705521821976,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0018330684397369623,
      "rewards/logprob_reward/std": 0.002843984169885516,
      "step": 1553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 710.8125,
      "completions/mean_terminated_length": 678.413818359375,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 4.796296296296296,
      "grad_norm": 2.1918860901621886,
      "kl": 0.245361328125,
      "learning_rate": 2.240563287863151e-09,
      "loss": -0.2624,
      "num_tokens": 44585338.0,
      "reward": 0.052353691309690475,
      "reward_std": 0.05389145761728287,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0026152110658586025,
      "rewards/logprob_reward/std": 0.0035162579733878374,
      "step": 1554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 709.0625,
      "completions/mean_terminated_length": 698.9031982421875,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 4.799382716049383,
      "grad_norm": 1.8584598367804426,
      "kl": 0.2265625,
      "learning_rate": 2.174276318894497e-09,
      "loss": 0.0024,
      "num_tokens": 44614944.0,
      "reward": 0.048598550260066986,
      "reward_std": 0.04688963294029236,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.001915052765980363,
      "rewards/logprob_reward/std": 0.002664490370079875,
      "step": 1555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 983.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 704.53125,
      "completions/mean_terminated_length": 704.53125,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 4.802469135802469,
      "grad_norm": 2.0321359065426527,
      "kl": 0.2298583984375,
      "learning_rate": 2.1089803954708884e-09,
      "loss": -0.0483,
      "num_tokens": 44644045.0,
      "reward": 0.0562812015414238,
      "reward_std": 0.04687017574906349,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003506890032440424,
      "rewards/logprob_reward/std": 0.0037236586213111877,
      "step": 1556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 704.3125,
      "completions/mean_terminated_length": 694.0,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 4.805555555555555,
      "grad_norm": 2.496444769411076,
      "kl": 0.2139892578125,
      "learning_rate": 2.0446757787082324e-09,
      "loss": -0.3015,
      "num_tokens": 44672883.0,
      "reward": 0.03860919177532196,
      "reward_std": 0.045600421726703644,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0012324335984885693,
      "rewards/logprob_reward/std": 0.0019448957173153758,
      "step": 1557
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 671.78125,
      "completions/mean_terminated_length": 635.3448486328125,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 4.8086419753086425,
      "grad_norm": 1.736623584195244,
      "kl": NaN,
      "learning_rate": 1.98136272575819e-09,
      "loss": -0.1128,
      "num_tokens": 44700868.0,
      "reward": 0.061923883855342865,
      "reward_std": 0.03648325800895691,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.002832094905897975,
      "rewards/logprob_reward/std": 0.00374964764341712,
      "step": 1558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 701.28125,
      "completions/mean_terminated_length": 679.7667236328125,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 4.811728395061729,
      "grad_norm": 1.956746107073524,
      "kl": 0.227294921875,
      "learning_rate": 1.919041489807233e-09,
      "loss": -0.0294,
      "num_tokens": 44729313.0,
      "reward": 0.06404221057891846,
      "reward_std": 0.04741805046796799,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0017135670641437173,
      "rewards/logprob_reward/std": 0.002254784805700183,
      "step": 1559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1018.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 703.375,
      "completions/mean_terminated_length": 703.375,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 4.814814814814815,
      "grad_norm": 2.025904015910776,
      "kl": 0.22314453125,
      "learning_rate": 1.857712320075616e-09,
      "loss": -0.1686,
      "num_tokens": 44758361.0,
      "reward": 0.05339666083455086,
      "reward_std": 0.04650755599141121,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0037740650586783886,
      "rewards/logprob_reward/std": 0.005097648594528437,
      "step": 1560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 726.125,
      "completions/mean_terminated_length": 706.2667236328125,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 4.817901234567901,
      "grad_norm": 2.485263575089205,
      "kl": 0.2247314453125,
      "learning_rate": 1.7973754618162972e-09,
      "loss": -0.2606,
      "num_tokens": 44788377.0,
      "reward": 0.04539719969034195,
      "reward_std": 0.04811866953969002,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001830223249271512,
      "rewards/logprob_reward/std": 0.0031537904869765043,
      "step": 1561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 664.09375,
      "completions/mean_terminated_length": 626.862060546875,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.820987654320987,
      "grad_norm": 2.3784729224611487,
      "kl": 0.2540283203125,
      "learning_rate": 1.7380311563140737e-09,
      "loss": -0.1922,
      "num_tokens": 44815652.0,
      "reward": 0.04746240749955177,
      "reward_std": 0.04821339249610901,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0006526728975586593,
      "rewards/logprob_reward/std": 0.0009571689297445118,
      "step": 1562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 742.8125,
      "completions/mean_terminated_length": 702.6428833007812,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 4.824074074074074,
      "grad_norm": 2.075657716941629,
      "kl": 0.2567138671875,
      "learning_rate": 1.6796796408845292e-09,
      "loss": -0.121,
      "num_tokens": 44846126.0,
      "reward": 0.04571472108364105,
      "reward_std": 0.04788792133331299,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0021830215118825436,
      "rewards/logprob_reward/std": 0.0033464357256889343,
      "step": 1563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 725.96875,
      "completions/mean_terminated_length": 706.1000366210938,
      "completions/min_length": 542.0,
      "completions/min_terminated_length": 542.0,
      "epoch": 4.827160493827161,
      "grad_norm": 2.018749742959247,
      "kl": 0.24560546875,
      "learning_rate": 1.622321148873146e-09,
      "loss": -0.0439,
      "num_tokens": 44875653.0,
      "reward": 0.04587321728467941,
      "reward_std": 0.05631254240870476,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0023591280914843082,
      "rewards/logprob_reward/std": 0.003474861616268754,
      "step": 1564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 709.53125,
      "completions/mean_terminated_length": 688.5667114257812,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 4.830246913580247,
      "grad_norm": 2.0734281347079837,
      "kl": 0.2442626953125,
      "learning_rate": 1.5659559096543318e-09,
      "loss": -0.1977,
      "num_tokens": 44905174.0,
      "reward": 0.050982531160116196,
      "reward_std": 0.04027026891708374,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.004563921596854925,
      "rewards/logprob_reward/std": 0.006054244004189968,
      "step": 1565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 672.625,
      "completions/mean_terminated_length": 661.290283203125,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 4.833333333333333,
      "grad_norm": 1.822317558433637,
      "kl": 0.21923828125,
      "learning_rate": 1.5105841486304783e-09,
      "loss": -0.0985,
      "num_tokens": 44932874.0,
      "reward": 0.057907603681087494,
      "reward_std": 0.035353753715753555,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.001841781078837812,
      "rewards/logprob_reward/std": 0.0017611915245652199,
      "step": 1566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 681.09375,
      "completions/mean_terminated_length": 658.2333374023438,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 4.83641975308642,
      "grad_norm": 2.123560453204114,
      "kl": 0.2528076171875,
      "learning_rate": 1.456206087231182e-09,
      "loss": -0.2365,
      "num_tokens": 44960993.0,
      "reward": 0.05193224549293518,
      "reward_std": 0.045639969408512115,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002146940678358078,
      "rewards/logprob_reward/std": 0.0024786905851215124,
      "step": 1567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 735.25,
      "completions/mean_terminated_length": 716.0000610351562,
      "completions/min_length": 551.0,
      "completions/min_terminated_length": 551.0,
      "epoch": 4.839506172839506,
      "grad_norm": 2.050238233273911,
      "kl": 0.22021484375,
      "learning_rate": 1.4028219429121912e-09,
      "loss": -0.135,
      "num_tokens": 44990989.0,
      "reward": 0.048916611820459366,
      "reward_std": 0.05314203351736069,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002268455922603607,
      "rewards/logprob_reward/std": 0.0028817281126976013,
      "step": 1568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 997.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 709.375,
      "completions/mean_terminated_length": 709.375,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 4.842592592592593,
      "grad_norm": 2.1466709104865003,
      "kl": 0.2158203125,
      "learning_rate": 1.350431929154655e-09,
      "loss": -0.0916,
      "num_tokens": 45021045.0,
      "reward": 0.05392808839678764,
      "reward_std": 0.04955313727259636,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.004364541731774807,
      "rewards/logprob_reward/std": 0.005988817662000656,
      "step": 1569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 965.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 701.34375,
      "completions/mean_terminated_length": 701.34375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.845679012345679,
      "grad_norm": 2.3927513095530353,
      "kl": 0.4453125,
      "learning_rate": 1.2990362554642087e-09,
      "loss": -0.1533,
      "num_tokens": 45050400.0,
      "reward": 0.05046911537647247,
      "reward_std": 0.05384611338376999,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.003993459045886993,
      "rewards/logprob_reward/std": 0.004564306698739529,
      "step": 1570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 710.90625,
      "completions/mean_terminated_length": 678.5172119140625,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 4.848765432098766,
      "grad_norm": 2.1825906101881607,
      "kl": 0.2135009765625,
      "learning_rate": 1.2486351273701678e-09,
      "loss": -0.0554,
      "num_tokens": 45079781.0,
      "reward": 0.0676162913441658,
      "reward_std": 0.047531113028526306,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.005684769246727228,
      "rewards/logprob_reward/std": 0.009872187860310078,
      "step": 1571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 737.5,
      "completions/mean_terminated_length": 671.3846435546875,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 4.851851851851852,
      "grad_norm": 1.7570850946419518,
      "kl": 0.23876953125,
      "learning_rate": 1.199228746424752e-09,
      "loss": 0.0072,
      "num_tokens": 45109933.0,
      "reward": 0.0463409498333931,
      "reward_std": 0.028527850285172462,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0028788335621356964,
      "rewards/logprob_reward/std": 0.00331366085447371,
      "step": 1572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1016.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 734.09375,
      "completions/mean_terminated_length": 734.09375,
      "completions/min_length": 554.0,
      "completions/min_terminated_length": 554.0,
      "epoch": 4.854938271604938,
      "grad_norm": 1.9395569993524355,
      "kl": 0.208740234375,
      "learning_rate": 1.1508173102021402e-09,
      "loss": -0.1806,
      "num_tokens": 45140212.0,
      "reward": 0.04876071214675903,
      "reward_std": 0.04676129296422005,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0020952331833541393,
      "rewards/logprob_reward/std": 0.002500335918739438,
      "step": 1573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1006.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 658.71875,
      "completions/mean_terminated_length": 658.71875,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 4.8580246913580245,
      "grad_norm": 1.7223103094456633,
      "kl": 0.2281494140625,
      "learning_rate": 1.1034010122978332e-09,
      "loss": -0.1822,
      "num_tokens": 45167359.0,
      "reward": 0.06386316567659378,
      "reward_std": 0.03483687713742256,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0015146301593631506,
      "rewards/logprob_reward/std": 0.00169938278850168,
      "step": 1574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 731.0625,
      "completions/mean_terminated_length": 676.8148193359375,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 4.861111111111111,
      "grad_norm": 1.8032321069261932,
      "kl": 0.2025146484375,
      "learning_rate": 1.0569800423277652e-09,
      "loss": -0.0998,
      "num_tokens": 45197257.0,
      "reward": 0.04143955558538437,
      "reward_std": 0.02968217059969902,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.007849502377212048,
      "rewards/logprob_reward/std": 0.017946681007742882,
      "step": 1575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 677.65625,
      "completions/mean_terminated_length": 666.4838256835938,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.864197530864198,
      "grad_norm": 2.022822446915842,
      "kl": 0.3104248046875,
      "learning_rate": 1.0115545859276098e-09,
      "loss": -0.2381,
      "num_tokens": 45225094.0,
      "reward": 0.0643848329782486,
      "reward_std": 0.05549953877925873,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0020942618139088154,
      "rewards/logprob_reward/std": 0.002886283677071333,
      "step": 1576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 748.34375,
      "completions/mean_terminated_length": 708.9642944335938,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 4.867283950617284,
      "grad_norm": 2.4189487363698916,
      "kl": 0.24072265625,
      "learning_rate": 9.67124824752058e-10,
      "loss": -0.1831,
      "num_tokens": 45255381.0,
      "reward": 0.030485866591334343,
      "reward_std": 0.04298137128353119,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0026231841184198856,
      "rewards/logprob_reward/std": 0.005127353128045797,
      "step": 1577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 679.5,
      "completions/mean_terminated_length": 643.862060546875,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 4.87037037037037,
      "grad_norm": 2.132278591164677,
      "kl": 0.2337646484375,
      "learning_rate": 9.236909364739587e-10,
      "loss": -0.1948,
      "num_tokens": 45283397.0,
      "reward": 0.05180853605270386,
      "reward_std": 0.04013790562748909,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0020094849169254303,
      "rewards/logprob_reward/std": 0.0025368938222527504,
      "step": 1578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 671.15625,
      "completions/mean_terminated_length": 659.774169921875,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.8734567901234565,
      "grad_norm": 2.207935082593846,
      "kl": 0.2347412109375,
      "learning_rate": 8.812530947837904e-10,
      "loss": -0.1477,
      "num_tokens": 45311434.0,
      "reward": 0.05283327028155327,
      "reward_std": 0.0455731637775898,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.003148077055811882,
      "rewards/logprob_reward/std": 0.004595972131937742,
      "step": 1579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 946.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 666.0,
      "completions/mean_terminated_length": 666.0,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 4.8765432098765435,
      "grad_norm": 1.9307497057717353,
      "kl": 0.25244140625,
      "learning_rate": 8.39811469388857e-10,
      "loss": -0.123,
      "num_tokens": 45339170.0,
      "reward": 0.06806036829948425,
      "reward_std": 0.053993478417396545,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0027059619314968586,
      "rewards/logprob_reward/std": 0.003276693867519498,
      "step": 1580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 729.8125,
      "completions/mean_terminated_length": 710.2000122070312,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 4.87962962962963,
      "grad_norm": 1.920348156533404,
      "kl": 0.221435546875,
      "learning_rate": 7.99366226012621e-10,
      "loss": -0.0808,
      "num_tokens": 45369232.0,
      "reward": 0.0456579253077507,
      "reward_std": 0.04085057973861694,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0021199120674282312,
      "rewards/logprob_reward/std": 0.0024745571427047253,
      "step": 1581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 939.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 620.65625,
      "completions/mean_terminated_length": 620.65625,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 4.882716049382716,
      "grad_norm": 2.190429693186302,
      "kl": 0.2633056640625,
      "learning_rate": 7.59917526394066e-10,
      "loss": -0.089,
      "num_tokens": 45395173.0,
      "reward": 0.05531671270728111,
      "reward_std": 0.04867561534047127,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0024352334439754486,
      "rewards/logprob_reward/std": 0.002892302582040429,
      "step": 1582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 762.34375,
      "completions/mean_terminated_length": 713.888916015625,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 4.885802469135802,
      "grad_norm": 3.064749837510519,
      "kl": 0.3494873046875,
      "learning_rate": 7.214655282870019e-10,
      "loss": -0.4241,
      "num_tokens": 45426068.0,
      "reward": 0.03897803649306297,
      "reward_std": 0.04806748032569885,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0016422626795247197,
      "rewards/logprob_reward/std": 0.003233575262129307,
      "step": 1583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 698.25,
      "completions/mean_terminated_length": 664.5516967773438,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 4.888888888888889,
      "grad_norm": 2.5170887270911484,
      "kl": 0.2440185546875,
      "learning_rate": 6.840103854595103e-10,
      "loss": -0.373,
      "num_tokens": 45455060.0,
      "reward": 0.04967810958623886,
      "reward_std": 0.0429002121090889,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0031145629473030567,
      "rewards/logprob_reward/std": 0.003188327420502901,
      "step": 1584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 867.0,
      "completions/mean_length": 670.28125,
      "completions/mean_terminated_length": 658.8709716796875,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 4.8919753086419755,
      "grad_norm": 2.473694955379459,
      "kl": 0.2425537109375,
      "learning_rate": 6.475522476932504e-10,
      "loss": -0.159,
      "num_tokens": 45482785.0,
      "reward": 0.02962297573685646,
      "reward_std": 0.05181487649679184,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.0016644163988530636,
      "rewards/logprob_reward/std": 0.0021899265702813864,
      "step": 1585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 726.4375,
      "completions/mean_terminated_length": 695.6551513671875,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 4.895061728395062,
      "grad_norm": 1.7364290674636216,
      "kl": 0.25,
      "learning_rate": 6.120912607829598e-10,
      "loss": -0.0823,
      "num_tokens": 45512679.0,
      "reward": 0.05661670118570328,
      "reward_std": 0.04919694364070892,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0038796698208898306,
      "rewards/logprob_reward/std": 0.006622091867029667,
      "step": 1586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 726.59375,
      "completions/mean_terminated_length": 684.107177734375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 4.898148148148148,
      "grad_norm": 2.2950158286306257,
      "kl": 0.250244140625,
      "learning_rate": 5.776275665357045e-10,
      "loss": -0.2739,
      "num_tokens": 45542862.0,
      "reward": 0.0551067590713501,
      "reward_std": 0.054269008338451385,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0022019576281309128,
      "rewards/logprob_reward/std": 0.0028854061383754015,
      "step": 1587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 681.65625,
      "completions/mean_terminated_length": 670.6128540039062,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 4.901234567901234,
      "grad_norm": 2.6068100350830052,
      "kl": 0.22119140625,
      "learning_rate": 5.441613027704905e-10,
      "loss": -0.2717,
      "num_tokens": 45570955.0,
      "reward": 0.0395234078168869,
      "reward_std": 0.05497060716152191,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002248229691758752,
      "rewards/logprob_reward/std": 0.002661328064277768,
      "step": 1588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 727.8125,
      "completions/mean_terminated_length": 708.0667114257812,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 4.904320987654321,
      "grad_norm": 1.9920277043367673,
      "kl": 0.2139892578125,
      "learning_rate": 5.116926033176261e-10,
      "loss": -0.0148,
      "num_tokens": 45600705.0,
      "reward": 0.06153532490134239,
      "reward_std": 0.046502865850925446,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0024003551807254553,
      "rewards/logprob_reward/std": 0.002641119994223118,
      "step": 1589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 761.09375,
      "completions/mean_terminated_length": 752.6128540039062,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 4.907407407407407,
      "grad_norm": 3.0123446210350835,
      "kl": 0.2215576171875,
      "learning_rate": 4.802215980182212e-10,
      "loss": -0.3666,
      "num_tokens": 45631872.0,
      "reward": 0.05237887427210808,
      "reward_std": 0.04788391292095184,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.002643193816766143,
      "rewards/logprob_reward/std": 0.0025566760450601578,
      "step": 1590
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 712.5,
      "completions/mean_terminated_length": 702.4515991210938,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 4.910493827160494,
      "grad_norm": 1.7712953033443068,
      "kl": NaN,
      "learning_rate": 4.4974841272357734e-10,
      "loss": -0.1552,
      "num_tokens": 45661572.0,
      "reward": 0.05617368221282959,
      "reward_std": 0.044310979545116425,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0033874278888106346,
      "rewards/logprob_reward/std": 0.005647736601531506,
      "step": 1591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 686.78125,
      "completions/mean_terminated_length": 675.9031982421875,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 4.91358024691358,
      "grad_norm": 2.0125235515929623,
      "kl": 0.2332763671875,
      "learning_rate": 4.2027316929479916e-10,
      "loss": -0.1436,
      "num_tokens": 45689749.0,
      "reward": 0.07134392112493515,
      "reward_std": 0.038457900285720825,
      "rewards/format_reward_func/mean": 0.6875,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0028821297455579042,
      "rewards/logprob_reward/std": 0.003696457017213106,
      "step": 1592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 750.6875,
      "completions/mean_terminated_length": 732.4666748046875,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 4.916666666666667,
      "grad_norm": 1.983940238140831,
      "kl": 0.206787109375,
      "learning_rate": 3.917959856022668e-10,
      "loss": -0.2454,
      "num_tokens": 45720159.0,
      "reward": 0.043411463499069214,
      "reward_std": 0.05083286762237549,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.00309606920927763,
      "rewards/logprob_reward/std": 0.004363714717328548,
      "step": 1593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 995.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 670.4375,
      "completions/mean_terminated_length": 670.4375,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 4.919753086419753,
      "grad_norm": 2.366956041561088,
      "kl": 0.2435302734375,
      "learning_rate": 3.6431697552510853e-10,
      "loss": -0.3542,
      "num_tokens": 45747777.0,
      "reward": 0.05452614277601242,
      "reward_std": 0.04330441728234291,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0015568279195576906,
      "rewards/logprob_reward/std": 0.0021392430644482374,
      "step": 1594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 628.28125,
      "completions/mean_terminated_length": 601.9000244140625,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 4.922839506172839,
      "grad_norm": 3.0493038641598247,
      "kl": 0.25439453125,
      "learning_rate": 3.3783624895086795e-10,
      "loss": -0.252,
      "num_tokens": 45774074.0,
      "reward": 0.0488990917801857,
      "reward_std": 0.039302758872509,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0022489873226732016,
      "rewards/logprob_reward/std": 0.0034289683680981398,
      "step": 1595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 732.84375,
      "completions/mean_terminated_length": 691.2500610351562,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 4.925925925925926,
      "grad_norm": 1.9956180922077569,
      "kl": 0.226318359375,
      "learning_rate": 3.123539117749485e-10,
      "loss": -0.3358,
      "num_tokens": 45803733.0,
      "reward": 0.045016758143901825,
      "reward_std": 0.04055551066994667,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.00140750827267766,
      "rewards/logprob_reward/std": 0.0021567540243268013,
      "step": 1596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1015.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 656.28125,
      "completions/mean_terminated_length": 656.28125,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 4.929012345679013,
      "grad_norm": 1.9060611947842503,
      "kl": 0.2230224609375,
      "learning_rate": 2.8787006590022535e-10,
      "loss": -0.0451,
      "num_tokens": 45831046.0,
      "reward": 0.062060076743364334,
      "reward_std": 0.033758118748664856,
      "rewards/format_reward_func/mean": 0.59375,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0029834159649908543,
      "rewards/logprob_reward/std": 0.0034631541930139065,
      "step": 1597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 770.21875,
      "completions/mean_terminated_length": 753.300048828125,
      "completions/min_length": 547.0,
      "completions/min_terminated_length": 547.0,
      "epoch": 4.932098765432099,
      "grad_norm": 2.74209572330302,
      "kl": 0.2210693359375,
      "learning_rate": 2.6438480923665627e-10,
      "loss": -0.3143,
      "num_tokens": 45862649.0,
      "reward": 0.037834376096725464,
      "reward_std": 0.03432668745517731,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0038437512703239918,
      "rewards/logprob_reward/std": 0.005583377555012703,
      "step": 1598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1013.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 729.9375,
      "completions/mean_terminated_length": 729.9375,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 4.935185185185185,
      "grad_norm": 2.2021310832433216,
      "kl": 0.203857421875,
      "learning_rate": 2.418982357008936e-10,
      "loss": -0.2982,
      "num_tokens": 45892687.0,
      "reward": 0.059219736605882645,
      "reward_std": 0.04891116917133331,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.003299708478152752,
      "rewards/logprob_reward/std": 0.00543432030826807,
      "step": 1599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 676.90625,
      "completions/mean_terminated_length": 653.7667236328125,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 4.938271604938271,
      "grad_norm": 2.350324710579544,
      "kl": 0.2523193359375,
      "learning_rate": 2.2041043521586756e-10,
      "loss": -0.3098,
      "num_tokens": 45920724.0,
      "reward": 0.057065851986408234,
      "reward_std": 0.0470709502696991,
      "rewards/format_reward_func/mean": 0.5625,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0009064996265806258,
      "rewards/logprob_reward/std": 0.0013932627625763416,
      "step": 1600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 731.6875,
      "completions/mean_terminated_length": 701.4483032226562,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 4.9413580246913575,
      "grad_norm": 2.3862283050818807,
      "kl": 0.215576171875,
      "learning_rate": 1.999214937104532e-10,
      "loss": -0.0814,
      "num_tokens": 45950466.0,
      "reward": 0.04002302512526512,
      "reward_std": 0.053182486444711685,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0028033629059791565,
      "rewards/logprob_reward/std": 0.004257589112967253,
      "step": 1601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 747.53125,
      "completions/mean_terminated_length": 670.1199951171875,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 4.944444444444445,
      "grad_norm": 1.89502426343952,
      "kl": 0.2392578125,
      "learning_rate": 1.8043149311916529e-10,
      "loss": -0.1025,
      "num_tokens": 45981219.0,
      "reward": 0.04924574866890907,
      "reward_std": 0.047872144728899,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002634164411574602,
      "rewards/logprob_reward/std": 0.004099159501492977,
      "step": 1602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 704.90625,
      "completions/mean_terminated_length": 694.6128540039062,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 4.947530864197531,
      "grad_norm": 1.967147765334899,
      "kl": 0.242919921875,
      "learning_rate": 1.6194051138176955e-10,
      "loss": -0.103,
      "num_tokens": 46010308.0,
      "reward": 0.05199815705418587,
      "reward_std": 0.034758321940898895,
      "rewards/format_reward_func/mean": 0.5,
      "rewards/format_reward_func/std": 0.5080004930496216,
      "rewards/logprob_reward/mean": 0.0022201724350452423,
      "rewards/logprob_reward/std": 0.0031493811402469873,
      "step": 1603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 773.9375,
      "completions/mean_terminated_length": 716.2307739257812,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 4.950617283950617,
      "grad_norm": 2.121344620306122,
      "kl": 0.203857421875,
      "learning_rate": 1.444486224429775e-10,
      "loss": -0.0935,
      "num_tokens": 46041862.0,
      "reward": 0.033243902027606964,
      "reward_std": 0.0474444180727005,
      "rewards/format_reward_func/mean": 0.3125,
      "rewards/format_reward_func/std": 0.4709290862083435,
      "rewards/logprob_reward/mean": 0.0022154483012855053,
      "rewards/logprob_reward/std": 0.0028852999676018953,
      "step": 1604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 688.09375,
      "completions/mean_terminated_length": 665.7000122070312,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.953703703703704,
      "grad_norm": 1.8824868639397032,
      "kl": 0.243408203125,
      "learning_rate": 1.2795589625216875e-10,
      "loss": -0.0987,
      "num_tokens": 46070293.0,
      "reward": 0.03957396745681763,
      "reward_std": 0.04261380434036255,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.002304406836628914,
      "rewards/logprob_reward/std": 0.003390522440895438,
      "step": 1605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 736.75,
      "completions/mean_terminated_length": 695.7142944335938,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 4.95679012345679,
      "grad_norm": 2.083293342426221,
      "kl": 0.2130126953125,
      "learning_rate": 1.1246239876316899e-10,
      "loss": -0.1156,
      "num_tokens": 46100737.0,
      "reward": 0.037211790680885315,
      "reward_std": 0.02014215663075447,
      "rewards/format_reward_func/mean": 0.28125,
      "rewards/format_reward_func/std": 0.45680341124534607,
      "rewards/logprob_reward/mean": 0.010096433572471142,
      "rewards/logprob_reward/std": 0.027432050555944443,
      "step": 1606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 707.59375,
      "completions/mean_terminated_length": 697.3870849609375,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 4.959876543209877,
      "grad_norm": 2.4448589061778687,
      "kl": 0.24462890625,
      "learning_rate": 9.796819193383376e-11,
      "loss": -0.2613,
      "num_tokens": 46129972.0,
      "reward": 0.04535143822431564,
      "reward_std": 0.04745528846979141,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017793738516047597,
      "rewards/logprob_reward/std": 0.0021039326675236225,
      "step": 1607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 721.21875,
      "completions/mean_terminated_length": 677.9642944335938,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 4.962962962962963,
      "grad_norm": 2.1682792903217996,
      "kl": 0.228515625,
      "learning_rate": 8.447333372593735e-11,
      "loss": -0.2039,
      "num_tokens": 46159891.0,
      "reward": 0.04062517732381821,
      "reward_std": 0.04160744324326515,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0034724189899861813,
      "rewards/logprob_reward/std": 0.006191283464431763,
      "step": 1608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1012.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 709.21875,
      "completions/mean_terminated_length": 709.21875,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 4.966049382716049,
      "grad_norm": 1.7180152269203575,
      "kl": 0.2156982421875,
      "learning_rate": 7.197787810492295e-11,
      "loss": -0.095,
      "num_tokens": 46189178.0,
      "reward": 0.04858778417110443,
      "reward_std": 0.04863159358501434,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0019030930707231164,
      "rewards/logprob_reward/std": 0.0023480455856770277,
      "step": 1609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 696.65625,
      "completions/mean_terminated_length": 686.0967407226562,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 4.969135802469136,
      "grad_norm": 2.0715313841857736,
      "kl": 0.23046875,
      "learning_rate": 6.04818750396252e-11,
      "loss": -0.0434,
      "num_tokens": 46218183.0,
      "reward": 0.051609545946121216,
      "reward_std": 0.05690581351518631,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.005260605365037918,
      "rewards/logprob_reward/std": 0.0065270280465483665,
      "step": 1610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 725.46875,
      "completions/mean_terminated_length": 682.8214721679688,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.972222222222222,
      "grad_norm": 2.1515640106787317,
      "kl": 0.2322998046875,
      "learning_rate": 4.9985370502131366e-11,
      "loss": -0.2735,
      "num_tokens": 46247914.0,
      "reward": 0.042196281254291534,
      "reward_std": 0.04903978109359741,
      "rewards/format_reward_func/mean": 0.40625,
      "rewards/format_reward_func/std": 0.49899089336395264,
      "rewards/logprob_reward/mean": 0.0017458668444305658,
      "rewards/logprob_reward/std": 0.003071760293096304,
      "step": 1611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 770.8125,
      "completions/mean_terminated_length": 753.933349609375,
      "completions/min_length": 537.0,
      "completions/min_terminated_length": 537.0,
      "epoch": 4.9753086419753085,
      "grad_norm": 2.0670932493705667,
      "kl": 0.1849365234375,
      "learning_rate": 4.0488406467559245e-11,
      "loss": -0.0363,
      "num_tokens": 46279428.0,
      "reward": 0.046390168368816376,
      "reward_std": 0.04829657822847366,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0029335212893784046,
      "rewards/logprob_reward/std": 0.0032161264680325985,
      "step": 1612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 962.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 710.3125,
      "completions/mean_terminated_length": 710.3125,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.978395061728395,
      "grad_norm": 2.3356190397023013,
      "kl": 0.2431640625,
      "learning_rate": 3.1991020913890723e-11,
      "loss": -0.2064,
      "num_tokens": 46308822.0,
      "reward": 0.037897706031799316,
      "reward_std": 0.030762355774641037,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.003914116881787777,
      "rewards/logprob_reward/std": 0.004848901182413101,
      "step": 1613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 770.5625,
      "completions/mean_terminated_length": 723.629638671875,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 4.981481481481482,
      "grad_norm": 2.1778873910598273,
      "kl": 0.2266845703125,
      "learning_rate": 2.449324782183293e-11,
      "loss": -0.2537,
      "num_tokens": 46340384.0,
      "reward": 0.039623111486434937,
      "reward_std": 0.04137878119945526,
      "rewards/format_reward_func/mean": 0.375,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0023590121418237686,
      "rewards/logprob_reward/std": 0.003141367109492421,
      "step": 1614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 773.46875,
      "completions/mean_terminated_length": 765.3870849609375,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 4.984567901234568,
      "grad_norm": 1.7731235111279475,
      "kl": 0.2000732421875,
      "learning_rate": 1.799511717470725e-11,
      "loss": -0.1573,
      "num_tokens": 46371883.0,
      "reward": 0.055310651659965515,
      "reward_std": 0.053711898624897,
      "rewards/format_reward_func/mean": 0.53125,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.002428502542898059,
      "rewards/logprob_reward/std": 0.00242333160713315,
      "step": 1615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 761.0625,
      "completions/mean_terminated_length": 723.5000610351562,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 4.987654320987654,
      "grad_norm": 3.5376521395514438,
      "kl": 0.240234375,
      "learning_rate": 1.2496654958310537e-11,
      "loss": -0.4812,
      "num_tokens": 46402961.0,
      "reward": 0.035960301756858826,
      "reward_std": 0.04699845612049103,
      "rewards/format_reward_func/mean": 0.34375,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0017614441458135843,
      "rewards/logprob_reward/std": 0.002412389265373349,
      "step": 1616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 995.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 642.65625,
      "completions/mean_terminated_length": 642.65625,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.9907407407407405,
      "grad_norm": 2.83895635142214,
      "kl": 0.2552490234375,
      "learning_rate": 7.997883160748563e-12,
      "loss": -0.1941,
      "num_tokens": 46429974.0,
      "reward": 0.06556230783462524,
      "reward_std": 0.046890728175640106,
      "rewards/format_reward_func/mean": 0.625,
      "rewards/format_reward_func/std": 0.49186936020851135,
      "rewards/logprob_reward/mean": 0.0034025656059384346,
      "rewards/logprob_reward/std": 0.004946530796587467,
      "step": 1617
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 705.8125,
      "completions/mean_terminated_length": 672.8965454101562,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 4.993827160493828,
      "grad_norm": 1.7022382281741213,
      "kl": NaN,
      "learning_rate": 4.4988197724360465e-12,
      "loss": 0.0113,
      "num_tokens": 46459332.0,
      "reward": 0.04789476841688156,
      "reward_std": 0.03340337052941322,
      "rewards/format_reward_func/mean": 0.46875,
      "rewards/format_reward_func/std": 0.507007360458374,
      "rewards/logprob_reward/mean": 0.0011330802226439118,
      "rewards/logprob_reward/std": 0.0016689967596903443,
      "step": 1618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 733.5,
      "completions/mean_terminated_length": 714.1333618164062,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 4.996913580246914,
      "grad_norm": 2.1619800951258923,
      "kl": 0.2430419921875,
      "learning_rate": 1.9994787860133646e-12,
      "loss": -0.1459,
      "num_tokens": 46489456.0,
      "reward": 0.045339275151491165,
      "reward_std": 0.03515031188726425,
      "rewards/format_reward_func/mean": 0.4375,
      "rewards/format_reward_func/std": 0.504016101360321,
      "rewards/logprob_reward/mean": 0.0017658602446317673,
      "rewards/logprob_reward/std": 0.002554364735260606,
      "step": 1619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 946.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 644.125,
      "completions/mean_terminated_length": 644.125,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 5.0,
      "grad_norm": 2.1215296053549455,
      "kl": 0.2723388671875,
      "learning_rate": 4.998701962355412e-13,
      "loss": -0.0872,
      "num_tokens": 46516164.0,
      "reward": 0.06689511239528656,
      "reward_std": 0.026477597653865814,
      "rewards/format_reward_func/mean": 0.65625,
      "rewards/format_reward_func/std": 0.4825586974620819,
      "rewards/logprob_reward/mean": 0.0014112326316535473,
      "rewards/logprob_reward/std": 0.0015478808199986815,
      "step": 1620
    },
    {
      "epoch": 5.0,
      "step": 1620,
      "total_flos": 0.0,
      "train_loss": -0.14862212164189392,
      "train_runtime": 20021.4112,
      "train_samples_per_second": 0.647,
      "train_steps_per_second": 0.081
    }
  ],
  "logging_steps": 1,
  "max_steps": 1620,
  "num_input_tokens_seen": 46516164,
  "num_train_epochs": 5,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}