{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1620,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 529.5,
      "completions/mean_terminated_length": 496.5333557128906,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.0030864197530864196,
      "grad_norm": 4.74130083751676,
      "kl": NaN,
      "learning_rate": 0.0,
      "loss": -0.1875,
      "num_tokens": 23472.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 724.0,
      "completions/mean_length": 542.03125,
      "completions/mean_terminated_length": 509.9000244140625,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 0.006172839506172839,
      "grad_norm": 4.716907773314693,
      "kl": NaN,
      "learning_rate": 1.020408163265306e-08,
      "loss": -0.1875,
      "num_tokens": 47325.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 2
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 638.625,
      "completions/mean_terminated_length": 510.16668701171875,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 0.009259259259259259,
      "grad_norm": 5.525517617865517,
      "kl": NaN,
      "learning_rate": 2.040816326530612e-08,
      "loss": -0.1875,
      "num_tokens": 74461.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 3
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 760.0,
      "completions/max_terminated_length": 760.0,
      "completions/mean_length": 482.125,
      "completions/mean_terminated_length": 482.125,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.012345679012345678,
      "grad_norm": 5.694040850544329,
      "kl": NaN,
      "learning_rate": 3.0612244897959183e-08,
      "loss": -0.1875,
      "num_tokens": 96257.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 4
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 772.0,
      "completions/max_terminated_length": 772.0,
      "completions/mean_length": 514.28125,
      "completions/mean_terminated_length": 514.28125,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.015432098765432098,
      "grad_norm": 9.88473138977109,
      "kl": NaN,
      "learning_rate": 4.081632653061224e-08,
      "loss": -0.0183,
      "num_tokens": 118998.0,
      "reward": -1.3969838619232178e-09,
      "reward_std": 0.18550412356853485,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 5
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 801.0,
      "completions/mean_length": 507.625,
      "completions/mean_terminated_length": 473.20001220703125,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "epoch": 0.018518518518518517,
      "grad_norm": 5.90717969482145,
      "kl": NaN,
      "learning_rate": 5.1020408163265303e-08,
      "loss": -0.1875,
      "num_tokens": 141370.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 6
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 684.75,
      "completions/mean_terminated_length": 636.2857666015625,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.021604938271604937,
      "grad_norm": 9.60087477291155,
      "kl": NaN,
      "learning_rate": 6.122448979591837e-08,
      "loss": -0.0,
      "num_tokens": 170098.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 7
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 829.0,
      "completions/mean_length": 563.28125,
      "completions/mean_terminated_length": 532.5667114257812,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.024691358024691357,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 7.142857142857142e-08,
      "loss": 0.0,
      "num_tokens": 194259.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 8
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 813.0,
      "completions/max_terminated_length": 813.0,
      "completions/mean_length": 526.5,
      "completions/mean_terminated_length": 526.5,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.027777777777777776,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 8.163265306122448e-08,
      "loss": 0.0,
      "num_tokens": 217555.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 9
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 508.1875,
      "completions/mean_terminated_length": 473.8000183105469,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.030864197530864196,
      "grad_norm": 12.119144518696967,
      "kl": NaN,
      "learning_rate": 9.183673469387755e-08,
      "loss": -0.0,
      "num_tokens": 239785.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15130963921546936,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 10
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 898.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 539.78125,
      "completions/mean_terminated_length": 539.78125,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.033950617283950615,
      "grad_norm": 4.187066212925879,
      "kl": NaN,
      "learning_rate": 1.0204081632653061e-07,
      "loss": -0.1875,
      "num_tokens": 263486.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 11
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 516.71875,
      "completions/mean_terminated_length": 500.3548278808594,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 0.037037037037037035,
      "grad_norm": 6.532980810899672,
      "kl": NaN,
      "learning_rate": 1.1224489795918366e-07,
      "loss": -0.0,
      "num_tokens": 286193.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 12
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 820.0,
      "completions/max_terminated_length": 820.0,
      "completions/mean_length": 524.1875,
      "completions/mean_terminated_length": 524.1875,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.040123456790123455,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 1.2244897959183673e-07,
      "loss": 0.0,
      "num_tokens": 309231.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 13
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 755.0,
      "completions/mean_length": 507.75,
      "completions/mean_terminated_length": 473.3333435058594,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.043209876543209874,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 1.326530612244898e-07,
      "loss": 0.0,
      "num_tokens": 331919.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 14
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 528.0,
      "completions/mean_terminated_length": 494.933349609375,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 0.046296296296296294,
      "grad_norm": 9.610937197935552,
      "kl": NaN,
      "learning_rate": 1.4285714285714285e-07,
      "loss": -0.1703,
      "num_tokens": 355315.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.12426391243934631,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 15
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 779.0,
      "completions/mean_length": 540.90625,
      "completions/mean_terminated_length": 525.3225708007812,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.04938271604938271,
      "grad_norm": 3.5559481335414636,
      "kl": NaN,
      "learning_rate": 1.5306122448979592e-07,
      "loss": -0.1875,
      "num_tokens": 379508.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 16
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 741.0,
      "completions/mean_length": 520.9375,
      "completions/mean_terminated_length": 504.70965576171875,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.05246913580246913,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 1.6326530612244896e-07,
      "loss": 0.0,
      "num_tokens": 402982.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 17
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 827.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 452.90625,
      "completions/mean_terminated_length": 452.90625,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "epoch": 0.05555555555555555,
      "grad_norm": 0.0019381009413819638,
      "kl": NaN,
      "learning_rate": 1.7346938775510203e-07,
      "loss": 0.0,
      "num_tokens": 423743.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 18
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 736.0,
      "completions/max_terminated_length": 736.0,
      "completions/mean_length": 513.03125,
      "completions/mean_terminated_length": 513.03125,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.05864197530864197,
      "grad_norm": 3.8400628573522324,
      "kl": NaN,
      "learning_rate": 1.836734693877551e-07,
      "loss": -0.1875,
      "num_tokens": 446912.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 19
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 567.125,
      "completions/mean_terminated_length": 482.5185241699219,
      "completions/min_length": 267.0,
      "completions/min_terminated_length": 267.0,
      "epoch": 0.06172839506172839,
      "grad_norm": 3.969671416208202,
      "kl": NaN,
      "learning_rate": 1.9387755102040814e-07,
      "loss": -0.1875,
      "num_tokens": 471448.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 20
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 886.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 511.4375,
      "completions/mean_terminated_length": 511.4375,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.06481481481481481,
      "grad_norm": 5.348841577705897,
      "kl": NaN,
      "learning_rate": 2.0408163265306121e-07,
      "loss": -0.1875,
      "num_tokens": 494254.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 21
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 550.875,
      "completions/mean_terminated_length": 535.6129150390625,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.06790123456790123,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 2.1428571428571426e-07,
      "loss": 0.0,
      "num_tokens": 519170.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 22
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 960.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 520.09375,
      "completions/mean_terminated_length": 520.09375,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 0.07098765432098765,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 2.2448979591836733e-07,
      "loss": 0.0,
      "num_tokens": 542389.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 23
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 744.0,
      "completions/mean_length": 494.8125,
      "completions/mean_terminated_length": 477.7419128417969,
      "completions/min_length": 269.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 0.07407407407407407,
      "grad_norm": 5.883080414950403,
      "kl": NaN,
      "learning_rate": 2.346938775510204e-07,
      "loss": -0.1875,
      "num_tokens": 564475.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 24
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 846.0,
      "completions/mean_length": 543.6875,
      "completions/mean_terminated_length": 511.66668701171875,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.07716049382716049,
      "grad_norm": 4.292052701984573,
      "kl": NaN,
      "learning_rate": 2.4489795918367347e-07,
      "loss": -0.1875,
      "num_tokens": 588373.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 25
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 799.0,
      "completions/max_terminated_length": 799.0,
      "completions/mean_length": 458.1875,
      "completions/mean_terminated_length": 458.1875,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.08024691358024691,
      "grad_norm": 5.34225123241201,
      "kl": NaN,
      "learning_rate": 2.551020408163265e-07,
      "loss": -0.1874,
      "num_tokens": 609663.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 26
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 844.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 499.46875,
      "completions/mean_terminated_length": 499.46875,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.08333333333333333,
      "grad_norm": 7.9853862621483795,
      "kl": NaN,
      "learning_rate": 2.653061224489796e-07,
      "loss": -0.0,
      "num_tokens": 632082.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 27
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 544.0625,
      "completions/mean_terminated_length": 528.5806274414062,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.08641975308641975,
      "grad_norm": 4.542744467532849,
      "kl": NaN,
      "learning_rate": 2.755102040816326e-07,
      "loss": -0.1875,
      "num_tokens": 656264.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 28
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 731.0,
      "completions/mean_length": 485.0,
      "completions/mean_terminated_length": 467.6128845214844,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 0.08950617283950617,
      "grad_norm": 4.302072310973842,
      "kl": NaN,
      "learning_rate": 2.857142857142857e-07,
      "loss": -0.1875,
      "num_tokens": 677960.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 29
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 785.0,
      "completions/max_terminated_length": 785.0,
      "completions/mean_length": 557.40625,
      "completions/mean_terminated_length": 557.40625,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 0.09259259259259259,
      "grad_norm": 6.465867393978447,
      "kl": NaN,
      "learning_rate": 2.9591836734693874e-07,
      "loss": -0.0,
      "num_tokens": 702525.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 30
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 804.0,
      "completions/mean_length": 560.625,
      "completions/mean_terminated_length": 529.7333374023438,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 0.09567901234567901,
      "grad_norm": 9.204797689062591,
      "kl": NaN,
      "learning_rate": 3.0612244897959183e-07,
      "loss": 0.187,
      "num_tokens": 726877.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1350192129611969,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 31
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 483.65625,
      "completions/mean_terminated_length": 466.2257995605469,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.09876543209876543,
      "grad_norm": 5.047012153592003,
      "kl": NaN,
      "learning_rate": 3.163265306122449e-07,
      "loss": -0.1875,
      "num_tokens": 748570.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 32
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 614.09375,
      "completions/mean_terminated_length": 555.5357666015625,
      "completions/min_length": 289.0,
      "completions/min_terminated_length": 289.0,
      "epoch": 0.10185185185185185,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 3.265306122448979e-07,
      "loss": 0.0,
      "num_tokens": 775373.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 33
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 542.53125,
      "completions/mean_terminated_length": 527.0,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.10493827160493827,
      "grad_norm": 5.930133727857815,
      "kl": NaN,
      "learning_rate": 3.3673469387755096e-07,
      "loss": -0.1875,
      "num_tokens": 798974.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 34
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 829.0,
      "completions/mean_length": 502.34375,
      "completions/mean_terminated_length": 485.51611328125,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.10802469135802469,
      "grad_norm": 10.104299577868375,
      "kl": NaN,
      "learning_rate": 3.4693877551020406e-07,
      "loss": -0.1874,
      "num_tokens": 821625.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.14388087391853333,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 35
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 739.0,
      "completions/max_terminated_length": 739.0,
      "completions/mean_length": 464.15625,
      "completions/mean_terminated_length": 464.15625,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.1111111111111111,
      "grad_norm": 7.848102759829451,
      "kl": NaN,
      "learning_rate": 3.5714285714285716e-07,
      "loss": -0.3749,
      "num_tokens": 842938.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 36
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 942.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 522.4375,
      "completions/mean_terminated_length": 522.4375,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.11419753086419752,
      "grad_norm": 1.7514018540364926,
      "kl": NaN,
      "learning_rate": 3.673469387755102e-07,
      "loss": -0.0719,
      "num_tokens": 865932.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.07739149034023285,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 37
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 616.0,
      "completions/mean_terminated_length": 557.7142944335938,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 0.11728395061728394,
      "grad_norm": 8.139154074899833,
      "kl": NaN,
      "learning_rate": 3.7755102040816324e-07,
      "loss": -0.1875,
      "num_tokens": 892432.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 38
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 780.0,
      "completions/max_terminated_length": 780.0,
      "completions/mean_length": 514.34375,
      "completions/mean_terminated_length": 514.34375,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 0.12037037037037036,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 3.877551020408163e-07,
      "loss": 0.0,
      "num_tokens": 915023.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 39
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 709.0,
      "completions/mean_length": 538.71875,
      "completions/mean_terminated_length": 488.5172424316406,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.12345679012345678,
      "grad_norm": 10.38047968051339,
      "kl": NaN,
      "learning_rate": 3.979591836734694e-07,
      "loss": -0.1875,
      "num_tokens": 939246.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 40
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 731.0,
      "completions/mean_length": 539.21875,
      "completions/mean_terminated_length": 489.0689697265625,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 0.12654320987654322,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 4.0816326530612243e-07,
      "loss": 0.0,
      "num_tokens": 963425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 41
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 681.0,
      "completions/mean_length": 481.46875,
      "completions/mean_terminated_length": 425.3448181152344,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.12962962962962962,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 4.183673469387755e-07,
      "loss": 0.0,
      "num_tokens": 985176.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 42
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 577.1875,
      "completions/mean_terminated_length": 547.4000244140625,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "epoch": 0.13271604938271606,
      "grad_norm": 4.9499267033359216,
      "kl": NaN,
      "learning_rate": 4.285714285714285e-07,
      "loss": -0.1875,
      "num_tokens": 1010554.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 43
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 833.0,
      "completions/max_terminated_length": 833.0,
      "completions/mean_length": 489.21875,
      "completions/mean_terminated_length": 489.21875,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.13580246913580246,
      "grad_norm": 7.089035443815076,
      "kl": NaN,
      "learning_rate": 4.387755102040816e-07,
      "loss": -0.1875,
      "num_tokens": 1032629.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 44
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 631.0,
      "completions/max_terminated_length": 631.0,
      "completions/mean_length": 473.34375,
      "completions/mean_terminated_length": 473.34375,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.1388888888888889,
      "grad_norm": 5.925047387295865,
      "kl": NaN,
      "learning_rate": 4.4897959183673465e-07,
      "loss": -0.0,
      "num_tokens": 1054348.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 45
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 594.375,
      "completions/mean_terminated_length": 514.8148193359375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.1419753086419753,
      "grad_norm": 8.248861691563496,
      "kl": NaN,
      "learning_rate": 4.5918367346938775e-07,
      "loss": -0.1875,
      "num_tokens": 1080012.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 46
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 792.0,
      "completions/mean_length": 525.75,
      "completions/mean_terminated_length": 492.5333557128906,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 0.14506172839506173,
      "grad_norm": 3.3479654261151626,
      "kl": NaN,
      "learning_rate": 4.693877551020408e-07,
      "loss": -0.1875,
      "num_tokens": 1103256.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 47
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 551.75,
      "completions/mean_terminated_length": 502.89654541015625,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.14814814814814814,
      "grad_norm": 4.134182401036001,
      "kl": NaN,
      "learning_rate": 4.795918367346938e-07,
      "loss": -0.1875,
      "num_tokens": 1127392.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 48
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 560.875,
      "completions/mean_terminated_length": 530.0,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "epoch": 0.15123456790123457,
      "grad_norm": 7.640595895124005,
      "kl": NaN,
      "learning_rate": 4.897959183673469e-07,
      "loss": -0.0,
      "num_tokens": 1152032.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 49
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 710.0,
      "completions/mean_length": 559.15625,
      "completions/mean_terminated_length": 492.7500305175781,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 0.15432098765432098,
      "grad_norm": 4.335900513254463,
      "kl": NaN,
      "learning_rate": 5e-07,
      "loss": -0.1875,
      "num_tokens": 1176601.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 50
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 682.0,
      "completions/max_terminated_length": 682.0,
      "completions/mean_length": 461.59375,
      "completions/mean_terminated_length": 461.59375,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 0.1574074074074074,
      "grad_norm": 6.874486086709533,
      "kl": NaN,
      "learning_rate": 4.999995001298037e-07,
      "loss": 0.1054,
      "num_tokens": 1197684.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14500659704208374,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 51
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 555.09375,
      "completions/mean_terminated_length": 539.9677124023438,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.16049382716049382,
      "grad_norm": 5.888160885729557,
      "kl": NaN,
      "learning_rate": 4.99998000521214e-07,
      "loss": 0.1873,
      "num_tokens": 1222175.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13756419718265533,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 52
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 523.8125,
      "completions/mean_terminated_length": 490.4667053222656,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.16358024691358025,
      "grad_norm": 6.895723747555437,
      "kl": NaN,
      "learning_rate": 4.999955011802275e-07,
      "loss": -0.1875,
      "num_tokens": 1245153.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 53
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 497.96875,
      "completions/mean_terminated_length": 481.0,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.16666666666666666,
      "grad_norm": 0.0,
      "kl": NaN,
      "learning_rate": 4.999920021168393e-07,
      "loss": 0.0,
      "num_tokens": 1267236.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 54
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 892.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 506.59375,
      "completions/mean_terminated_length": 506.59375,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "epoch": 0.1697530864197531,
      "grad_norm": 11.542699057812628,
      "kl": NaN,
      "learning_rate": 4.999875033450417e-07,
      "loss": -0.2016,
      "num_tokens": 1290135.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.23362484574317932,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 55
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 580.3125,
      "completions/mean_terminated_length": 550.7333374023438,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 0.1728395061728395,
      "grad_norm": 6.082929826276223,
      "kl": NaN,
      "learning_rate": 4.999820048828253e-07,
      "loss": -0.0,
      "num_tokens": 1316181.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 56
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 793.0,
      "completions/mean_length": 537.40625,
      "completions/mean_terminated_length": 487.0689697265625,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 0.17592592592592593,
      "grad_norm": 3.300891508456188,
      "kl": NaN,
      "learning_rate": 4.999755067521781e-07,
      "loss": -0.1875,
      "num_tokens": 1339862.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 57
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 772.0,
      "completions/mean_length": 510.28125,
      "completions/mean_terminated_length": 476.0333557128906,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "epoch": 0.17901234567901234,
      "grad_norm": 10.718095689791374,
      "kl": NaN,
      "learning_rate": 4.999680089790861e-07,
      "loss": -0.1875,
      "num_tokens": 1363267.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 58
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 806.0,
      "completions/max_terminated_length": 806.0,
      "completions/mean_length": 480.3125,
      "completions/mean_terminated_length": 480.3125,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.18209876543209877,
      "grad_norm": 5.638279253022429,
      "kl": NaN,
      "learning_rate": 4.999595115935325e-07,
      "loss": -0.0,
      "num_tokens": 1385133.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 59
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 764.0,
      "completions/max_terminated_length": 764.0,
      "completions/mean_length": 489.03125,
      "completions/mean_terminated_length": 489.03125,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.18518518518518517,
      "grad_norm": 5.361759756119009,
      "kl": NaN,
      "learning_rate": 4.999500146294979e-07,
      "loss": -0.3749,
      "num_tokens": 1406898.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 60
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 609.25,
      "completions/mean_terminated_length": 595.8709716796875,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 0.1882716049382716,
      "grad_norm": 8.180312606246089,
      "kl": NaN,
      "learning_rate": 4.999395181249604e-07,
      "loss": 0.187,
      "num_tokens": 1433194.0,
      "reward": 0.0,
      "reward_std": 0.15781110525131226,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 61
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 475.28125,
      "completions/mean_terminated_length": 457.58062744140625,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.19135802469135801,
      "grad_norm": 9.370486688856312,
      "kl": NaN,
      "learning_rate": 4.99928022121895e-07,
      "loss": 0.1869,
      "num_tokens": 1454883.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13469690084457397,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 62
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 705.0,
      "completions/mean_length": 527.875,
      "completions/mean_terminated_length": 476.5517272949219,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.19444444444444445,
      "grad_norm": 4.488958880852314,
      "kl": NaN,
      "learning_rate": 4.99915526666274e-07,
      "loss": -0.0793,
      "num_tokens": 1478499.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.12216030061244965,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 63
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 556.65625,
      "completions/mean_terminated_length": 525.5,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.19753086419753085,
      "grad_norm": 8.177889428278384,
      "kl": NaN,
      "learning_rate": 4.999020318080661e-07,
      "loss": 0.2708,
      "num_tokens": 1502564.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.23764848709106445,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909375190735,
      "step": 64
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 716.0,
      "completions/max_terminated_length": 716.0,
      "completions/mean_length": 442.84375,
      "completions/mean_terminated_length": 442.84375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.2006172839506173,
      "grad_norm": 8.354373096484785,
      "kl": NaN,
      "learning_rate": 4.998875376012368e-07,
      "loss": 0.1869,
      "num_tokens": 1522991.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.13481050729751587,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 65
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 782.0,
      "completions/mean_length": 517.96875,
      "completions/mean_terminated_length": 484.2333679199219,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.2037037037037037,
      "grad_norm": 9.443968749041115,
      "kl": NaN,
      "learning_rate": 4.998720441037479e-07,
      "loss": 0.1873,
      "num_tokens": 1546154.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1590980887413025,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 66
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 582.15625,
      "completions/mean_terminated_length": 519.0357666015625,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.20679012345679013,
      "grad_norm": 6.238242069432459,
      "kl": NaN,
      "learning_rate": 4.99855551377557e-07,
      "loss": 0.0607,
      "num_tokens": 1571659.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.14501814544200897,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 67
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 459.90625,
      "completions/mean_terminated_length": 441.70965576171875,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.20987654320987653,
      "grad_norm": 6.357087874651633,
      "kl": NaN,
      "learning_rate": 4.998380594886182e-07,
      "loss": 0.0735,
      "num_tokens": 1592716.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.12894144654273987,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 68
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 860.0,
      "completions/max_terminated_length": 860.0,
      "completions/mean_length": 518.375,
      "completions/mean_terminated_length": 518.375,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.21296296296296297,
      "grad_norm": 8.525097137526107,
      "kl": NaN,
      "learning_rate": 4.998195685068808e-07,
      "loss": -0.1602,
      "num_tokens": 1615664.0,
      "reward": 0.0,
      "reward_std": 0.22717738151550293,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 69
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 549.21875,
      "completions/mean_terminated_length": 517.5667114257812,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.21604938271604937,
      "grad_norm": 11.023532351660702,
      "kl": NaN,
      "learning_rate": 4.998000785062895e-07,
      "loss": -0.0997,
      "num_tokens": 1640415.0,
      "reward": 0.0,
      "reward_std": 0.15491779148578644,
      "rewards/format_reward_func/mean": -2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 70
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 589.1875,
      "completions/mean_terminated_length": 527.0714721679688,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.2191358024691358,
      "grad_norm": 9.95859480120045,
      "kl": NaN,
      "learning_rate": 4.997795895647841e-07,
      "loss": -0.0,
      "num_tokens": 1666257.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 71
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 770.0,
      "completions/mean_length": 427.78125,
      "completions/mean_terminated_length": 408.5483703613281,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.2222222222222222,
      "grad_norm": 9.711779424497152,
      "kl": NaN,
      "learning_rate": 4.997581017642991e-07,
      "loss": -0.3742,
      "num_tokens": 1686250.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18278822302818298,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 72
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 555.96875,
      "completions/mean_terminated_length": 447.9615478515625,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.22530864197530864,
      "grad_norm": 12.908124001807106,
      "kl": NaN,
      "learning_rate": 4.997356151907633e-07,
      "loss": 0.1446,
      "num_tokens": 1710849.0,
      "reward": -4.6566128730773926e-09,
      "reward_std": 0.22358503937721252,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 73
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 442.53125,
      "completions/mean_terminated_length": 423.774169921875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.22839506172839505,
      "grad_norm": 8.08954496023087,
      "kl": NaN,
      "learning_rate": 4.997121299340997e-07,
      "loss": 0.3373,
      "num_tokens": 1731202.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1365959793329239,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 74
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 780.0,
      "completions/mean_length": 527.40625,
      "completions/mean_terminated_length": 494.3000183105469,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.23148148148148148,
      "grad_norm": 7.5287129512560345,
      "kl": NaN,
      "learning_rate": 4.99687646088225e-07,
      "loss": 0.4762,
      "num_tokens": 1754607.0,
      "reward": 1.1175870895385742e-08,
      "reward_std": 0.22805829346179962,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 75
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 706.0,
      "completions/mean_length": 444.34375,
      "completions/mean_terminated_length": 425.6451416015625,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.2345679012345679,
      "grad_norm": 6.557388492091582,
      "kl": NaN,
      "learning_rate": 4.996621637510491e-07,
      "loss": 0.2711,
      "num_tokens": 1775542.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.14887697994709015,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 76
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 525.8125,
      "completions/mean_terminated_length": 509.7419128417969,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.23765432098765432,
      "grad_norm": 5.279569116437037,
      "kl": NaN,
      "learning_rate": 4.996356830244749e-07,
      "loss": -0.1931,
      "num_tokens": 1799104.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.19772586226463318,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 5.587935447692871e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 77
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 483.03125,
      "completions/mean_terminated_length": 465.58062744140625,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.24074074074074073,
      "grad_norm": 7.532480100642041,
      "kl": NaN,
      "learning_rate": 4.996082040143977e-07,
      "loss": 0.0932,
      "num_tokens": 1820565.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.21790876984596252,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 78
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 525.40625,
      "completions/mean_terminated_length": 454.1785888671875,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.24382716049382716,
      "grad_norm": 2.2175718648911444,
      "kl": NaN,
      "learning_rate": 4.995797268307051e-07,
      "loss": -0.0517,
      "num_tokens": 1843894.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 79
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 873.0,
      "completions/mean_length": 552.6875,
      "completions/mean_terminated_length": 465.40740966796875,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.24691358024691357,
      "grad_norm": 2.7112926160139468,
      "kl": NaN,
      "learning_rate": 4.995502515872763e-07,
      "loss": -0.0311,
      "num_tokens": 1868112.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.06587611883878708,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 80
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 751.0,
      "completions/mean_length": 493.15625,
      "completions/mean_terminated_length": 476.0322570800781,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.25,
      "grad_norm": 7.540066725909991,
      "kl": NaN,
      "learning_rate": 4.995197784019818e-07,
      "loss": -0.374,
      "num_tokens": 1890461.0,
      "reward": 0.0,
      "reward_std": 0.17774741351604462,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 81
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 770.0,
      "completions/mean_length": 553.46875,
      "completions/mean_terminated_length": 504.7930908203125,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.25308641975308643,
      "grad_norm": 8.277666974684571,
      "kl": NaN,
      "learning_rate": 4.994883073966823e-07,
      "loss": 0.0969,
      "num_tokens": 1914820.0,
      "reward": 0.0,
      "reward_std": 0.1777360737323761,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 82
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 537.15625,
      "completions/mean_terminated_length": 521.4515991210938,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 0.25617283950617287,
      "grad_norm": 6.7740350517861865,
      "kl": NaN,
      "learning_rate": 4.994558386972295e-07,
      "loss": -0.4215,
      "num_tokens": 1938793.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.21613164246082306,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 83
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 785.0,
      "completions/mean_length": 458.1875,
      "completions/mean_terminated_length": 420.4666748046875,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.25925925925925924,
      "grad_norm": 2.6867147276116556,
      "kl": NaN,
      "learning_rate": 4.994223724334643e-07,
      "loss": -0.055,
      "num_tokens": 1959735.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.1328701227903366,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 84
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 738.0,
      "completions/mean_length": 527.21875,
      "completions/mean_terminated_length": 475.82757568359375,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.2623456790123457,
      "grad_norm": 5.6286038551084685,
      "kl": NaN,
      "learning_rate": 4.99387908739217e-07,
      "loss": -0.3749,
      "num_tokens": 1983126.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 85
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 741.0,
      "completions/mean_length": 522.34375,
      "completions/mean_terminated_length": 488.9000244140625,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.2654320987654321,
      "grad_norm": 9.699949499271524,
      "kl": NaN,
      "learning_rate": 4.993524477523067e-07,
      "loss": -0.0421,
      "num_tokens": 2006269.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.2729633152484894,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 86
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 797.0,
      "completions/mean_length": 624.6875,
      "completions/mean_terminated_length": 550.74072265625,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 0.26851851851851855,
      "grad_norm": 6.430087493862934,
      "kl": NaN,
      "learning_rate": 4.993159896145405e-07,
      "loss": 0.1025,
      "num_tokens": 2033575.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1897057294845581,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 87
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 481.5625,
      "completions/mean_terminated_length": 445.4000244140625,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.2716049382716049,
      "grad_norm": 11.45326297061015,
      "kl": NaN,
      "learning_rate": 4.99278534471713e-07,
      "loss": -0.0001,
      "num_tokens": 2055329.0,
      "reward": 0.0,
      "reward_std": 0.1972888708114624,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 88
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 760.0,
      "completions/mean_length": 581.125,
      "completions/mean_terminated_length": 499.1111145019531,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.27469135802469136,
      "grad_norm": 10.331998309422328,
      "kl": NaN,
      "learning_rate": 4.992400824736059e-07,
      "loss": 0.3698,
      "num_tokens": 2080901.0,
      "reward": 0.0,
      "reward_std": 0.17625850439071655,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 89
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 884.0,
      "completions/mean_length": 594.5,
      "completions/mean_terminated_length": 514.9629516601562,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "epoch": 0.2777777777777778,
      "grad_norm": 11.165598362989805,
      "kl": NaN,
      "learning_rate": 4.992006337739874e-07,
      "loss": -0.0311,
      "num_tokens": 2106693.0,
      "reward": 1.4901161193847656e-08,
      "reward_std": 0.31283944845199585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 90
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 519.8125,
      "completions/mean_terminated_length": 486.20001220703125,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "epoch": 0.2808641975308642,
      "grad_norm": 13.002113964292453,
      "kl": NaN,
      "learning_rate": 4.991601885306111e-07,
      "loss": 0.3115,
      "num_tokens": 2129763.0,
      "reward": 1.4901161193847656e-08,
      "reward_std": 0.2638370990753174,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 91
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 774.0,
      "completions/mean_length": 555.15625,
      "completions/mean_terminated_length": 488.1785888671875,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.2839506172839506,
      "grad_norm": 12.791846118361295,
      "kl": NaN,
      "learning_rate": 4.991187469052162e-07,
      "loss": 0.2108,
      "num_tokens": 2154216.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.2904241681098938,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 92
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 836.0,
      "completions/mean_length": 536.5,
      "completions/mean_terminated_length": 504.0000305175781,
      "completions/min_length": 267.0,
      "completions/min_terminated_length": 267.0,
      "epoch": 0.28703703703703703,
      "grad_norm": 2.792875412014937,
      "kl": NaN,
      "learning_rate": 4.99076309063526e-07,
      "loss": -0.0838,
      "num_tokens": 2177772.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2394038587808609,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 93
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 547.1875,
      "completions/mean_terminated_length": 497.862060546875,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.29012345679012347,
      "grad_norm": 9.757143632177169,
      "kl": NaN,
      "learning_rate": 4.99032875175248e-07,
      "loss": -0.1874,
      "num_tokens": 2201578.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18489786982536316,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.3969838619232178e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 94
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 550.875,
      "completions/mean_terminated_length": 463.25927734375,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.2932098765432099,
      "grad_norm": 8.9755366295973,
      "kl": NaN,
      "learning_rate": 4.989884454140724e-07,
      "loss": 0.4945,
      "num_tokens": 2225358.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.16876746714115143,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 95
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 471.71875,
      "completions/mean_terminated_length": 453.9031982421875,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.2962962962962963,
      "grad_norm": 9.55185606564022,
      "kl": NaN,
      "learning_rate": 4.989430199576722e-07,
      "loss": 0.2836,
      "num_tokens": 2246301.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.29134178161621094,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 96
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 512.84375,
      "completions/mean_terminated_length": 478.7666931152344,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.2993827160493827,
      "grad_norm": 7.801768149510328,
      "kl": NaN,
      "learning_rate": 4.988965989877022e-07,
      "loss": -0.1874,
      "num_tokens": 2268696.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.16211099922657013,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 97
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 771.0,
      "completions/mean_length": 547.46875,
      "completions/mean_terminated_length": 515.7000122070312,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.30246913580246915,
      "grad_norm": 8.110425669427034,
      "kl": NaN,
      "learning_rate": 4.988491826897978e-07,
      "loss": 0.2677,
      "num_tokens": 2292971.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.1473100185394287,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 98
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 866.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 499.5,
      "completions/mean_terminated_length": 499.5,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.3055555555555556,
      "grad_norm": 3.1063567716739673,
      "kl": NaN,
      "learning_rate": 4.988007712535752e-07,
      "loss": -0.0527,
      "num_tokens": 2315347.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.1585049033164978,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 99
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 763.0,
      "completions/mean_length": 574.71875,
      "completions/mean_terminated_length": 510.5357360839844,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.30864197530864196,
      "grad_norm": 5.620277176386703,
      "kl": NaN,
      "learning_rate": 4.987513648726298e-07,
      "loss": -0.247,
      "num_tokens": 2340430.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 100
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 719.0,
      "completions/mean_length": 551.3125,
      "completions/mean_terminated_length": 483.7857360839844,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.3117283950617284,
      "grad_norm": 11.872955258019072,
      "kl": NaN,
      "learning_rate": 4.987009637445358e-07,
      "loss": 0.1215,
      "num_tokens": 2364524.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.3255133628845215,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 101
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 862.0,
      "completions/mean_length": 566.84375,
      "completions/mean_terminated_length": 519.5516967773438,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.3148148148148148,
      "grad_norm": 11.232002516340126,
      "kl": NaN,
      "learning_rate": 4.986495680708453e-07,
      "loss": 0.584,
      "num_tokens": 2389303.0,
      "reward": 0.0,
      "reward_std": 0.25867360830307007,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 102
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 829.0,
      "completions/mean_length": 550.03125,
      "completions/mean_terminated_length": 482.3214416503906,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.31790123456790126,
      "grad_norm": 9.777628123938609,
      "kl": NaN,
      "learning_rate": 4.985971780570878e-07,
      "loss": -0.4652,
      "num_tokens": 2413348.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.35007143020629883,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 103
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 524.625,
      "completions/mean_terminated_length": 491.3333740234375,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.32098765432098764,
      "grad_norm": 9.424241070510574,
      "kl": NaN,
      "learning_rate": 4.985437939127687e-07,
      "loss": -0.3748,
      "num_tokens": 2437404.0,
      "reward": -5.122274160385132e-09,
      "reward_std": 0.1748603880405426,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 104
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 625.5,
      "completions/mean_terminated_length": 533.5384521484375,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.32407407407407407,
      "grad_norm": 6.171494848229457,
      "kl": NaN,
      "learning_rate": 4.984894158513696e-07,
      "loss": 0.0388,
      "num_tokens": 2464428.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.24299165606498718,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 105
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 829.0,
      "completions/mean_length": 495.9375,
      "completions/mean_terminated_length": 460.7333679199219,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.3271604938271605,
      "grad_norm": 11.006528150853264,
      "kl": NaN,
      "learning_rate": 4.984340440903456e-07,
      "loss": 0.0792,
      "num_tokens": 2486418.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.34472817182540894,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 106
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 828.0,
      "completions/mean_length": 529.3125,
      "completions/mean_terminated_length": 478.137939453125,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.33024691358024694,
      "grad_norm": 8.69266245923211,
      "kl": NaN,
      "learning_rate": 4.983776788511268e-07,
      "loss": 0.4456,
      "num_tokens": 2509496.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18249543011188507,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 107
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 531.8125,
      "completions/mean_terminated_length": 515.9354858398438,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.3333333333333333,
      "grad_norm": 6.409955823006029,
      "kl": NaN,
      "learning_rate": 4.983203203591154e-07,
      "loss": 0.3638,
      "num_tokens": 2532630.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.15304797887802124,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 108
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 867.0,
      "completions/mean_length": 566.53125,
      "completions/mean_terminated_length": 551.774169921875,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "epoch": 0.33641975308641975,
      "grad_norm": 7.052145533314692,
      "kl": NaN,
      "learning_rate": 4.982619688436859e-07,
      "loss": -0.1236,
      "num_tokens": 2557651.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2175247073173523,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 109
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 847.0,
      "completions/max_terminated_length": 847.0,
      "completions/mean_length": 554.15625,
      "completions/mean_terminated_length": 554.15625,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.3395061728395062,
      "grad_norm": 5.6505366827006736,
      "kl": NaN,
      "learning_rate": 4.982026245381837e-07,
      "loss": 0.1873,
      "num_tokens": 2581932.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.12099941819906235,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 110
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 849.0,
      "completions/mean_length": 520.875,
      "completions/mean_terminated_length": 468.82757568359375,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.3425925925925926,
      "grad_norm": 4.990668417433553,
      "kl": NaN,
      "learning_rate": 4.981422876799244e-07,
      "loss": -0.125,
      "num_tokens": 2605524.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.36357975006103516,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 111
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 719.0,
      "completions/max_terminated_length": 719.0,
      "completions/mean_length": 447.4375,
      "completions/mean_terminated_length": 447.4375,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.345679012345679,
      "grad_norm": 8.097445924579832,
      "kl": NaN,
      "learning_rate": 4.980809585101927e-07,
      "loss": 0.1437,
      "num_tokens": 2626234.0,
      "reward": 0.0,
      "reward_std": 0.24270951747894287,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909375190735,
      "step": 112
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 607.0,
      "completions/mean_terminated_length": 510.7692565917969,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "epoch": 0.3487654320987654,
      "grad_norm": 9.486949821902869,
      "kl": NaN,
      "learning_rate": 4.980186372742417e-07,
      "loss": -0.4536,
      "num_tokens": 2652698.0,
      "reward": 0.0,
      "reward_std": 0.2161448448896408,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4751909375190735,
      "step": 113
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 560.90625,
      "completions/mean_terminated_length": 530.0333862304688,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.35185185185185186,
      "grad_norm": 8.181338165034866,
      "kl": NaN,
      "learning_rate": 4.979553242212917e-07,
      "loss": 0.1245,
      "num_tokens": 2677363.0,
      "reward": 0.0,
      "reward_std": 0.27589231729507446,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 114
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 762.0,
      "completions/mean_length": 602.90625,
      "completions/mean_terminated_length": 485.0,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 0.3549382716049383,
      "grad_norm": 7.060827852615143,
      "kl": NaN,
      "learning_rate": 4.978910196045291e-07,
      "loss": -0.1508,
      "num_tokens": 2702860.0,
      "reward": 6.51925802230835e-09,
      "reward_std": 0.26213163137435913,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 115
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 523.59375,
      "completions/mean_terminated_length": 471.82757568359375,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.35802469135802467,
      "grad_norm": 5.58331507303849,
      "kl": NaN,
      "learning_rate": 4.978257236811055e-07,
      "loss": -0.0721,
      "num_tokens": 2725663.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.13394224643707275,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 116
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 580.0,
      "completions/mean_terminated_length": 534.0689697265625,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.3611111111111111,
      "grad_norm": 5.442727785309853,
      "kl": NaN,
      "learning_rate": 4.977594367121369e-07,
      "loss": -0.0725,
      "num_tokens": 2750763.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20845584571361542,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 117
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 608.40625,
      "completions/mean_terminated_length": 492.03997802734375,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.36419753086419754,
      "grad_norm": 7.343243332926292,
      "kl": NaN,
      "learning_rate": 4.976921589627021e-07,
      "loss": -0.0781,
      "num_tokens": 2776880.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.23553408682346344,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 118
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 542.5625,
      "completions/mean_terminated_length": 527.0322265625,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 0.36728395061728397,
      "grad_norm": 5.909938660860529,
      "kl": NaN,
      "learning_rate": 4.976238907018427e-07,
      "loss": 0.3108,
      "num_tokens": 2800338.0,
      "reward": 0.0,
      "reward_std": 0.1802201271057129,
      "rewards/format_reward_func/mean": -2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 119
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 556.0,
      "completions/mean_terminated_length": 507.5862121582031,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.37037037037037035,
      "grad_norm": 7.084304579069176,
      "kl": NaN,
      "learning_rate": 4.975546322025605e-07,
      "loss": 0.1766,
      "num_tokens": 2824806.0,
      "reward": 0.0,
      "reward_std": 0.19519078731536865,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 120
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 843.0,
      "completions/mean_length": 534.125,
      "completions/mean_terminated_length": 518.3225708007812,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.3734567901234568,
      "grad_norm": 8.40579312165444,
      "kl": NaN,
      "learning_rate": 4.974843837418175e-07,
      "loss": -0.2458,
      "num_tokens": 2848610.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.30575814843177795,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 121
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 602.5,
      "completions/mean_terminated_length": 542.2857666015625,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.3765432098765432,
      "grad_norm": 5.7832704011193865,
      "kl": NaN,
      "learning_rate": 4.974131456005349e-07,
      "loss": 0.2561,
      "num_tokens": 2874374.0,
      "reward": -1.4901161193847656e-08,
      "reward_std": 0.20695728063583374,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 122
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 674.90625,
      "completions/mean_terminated_length": 610.25927734375,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 0.37962962962962965,
      "grad_norm": 5.532237037603806,
      "kl": NaN,
      "learning_rate": 4.973409180635911e-07,
      "loss": -0.1752,
      "num_tokens": 2903135.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18403783440589905,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 123
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 847.0,
      "completions/mean_length": 619.9375,
      "completions/mean_terminated_length": 526.6923217773438,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.38271604938271603,
      "grad_norm": 7.290631465314015,
      "kl": NaN,
      "learning_rate": 4.972677014198213e-07,
      "loss": -0.3747,
      "num_tokens": 2929477.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.19752418994903564,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 124
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 823.0,
      "completions/mean_length": 555.5,
      "completions/mean_terminated_length": 524.2667236328125,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 0.38580246913580246,
      "grad_norm": 5.688696440304526,
      "kl": NaN,
      "learning_rate": 4.97193495962016e-07,
      "loss": 0.1632,
      "num_tokens": 2953441.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.20944499969482422,
      "rewards/format_reward_func/mean": -3.725290298461914e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 125
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 877.0,
      "completions/mean_length": 595.34375,
      "completions/mean_terminated_length": 534.107177734375,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.3888888888888889,
      "grad_norm": 4.705895167184754,
      "kl": NaN,
      "learning_rate": 4.971183019869201e-07,
      "loss": -0.1042,
      "num_tokens": 2978668.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.17375023663043976,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 126
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 719.0,
      "completions/mean_length": 598.125,
      "completions/mean_terminated_length": 537.2857666015625,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 0.39197530864197533,
      "grad_norm": 7.694546935881251,
      "kl": NaN,
      "learning_rate": 4.970421197952311e-07,
      "loss": 0.0792,
      "num_tokens": 3004416.0,
      "reward": 0.0,
      "reward_std": 0.1776627004146576,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 127
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 901.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 544.46875,
      "completions/mean_terminated_length": 544.46875,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 0.3950617283950617,
      "grad_norm": 6.548991620439674,
      "kl": NaN,
      "learning_rate": 4.969649496915991e-07,
      "loss": -0.2703,
      "num_tokens": 3028467.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.3327018618583679,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5679618120193481,
      "step": 128
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 808.0,
      "completions/mean_length": 559.71875,
      "completions/mean_terminated_length": 544.741943359375,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.39814814814814814,
      "grad_norm": 8.554204995566714,
      "kl": NaN,
      "learning_rate": 4.96886791984624e-07,
      "loss": -0.0874,
      "num_tokens": 3053234.0,
      "reward": 0.0,
      "reward_std": 0.2549504041671753,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 129
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 845.0,
      "completions/mean_length": 570.6875,
      "completions/mean_terminated_length": 540.4666748046875,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.4012345679012346,
      "grad_norm": 8.127682526083769,
      "kl": NaN,
      "learning_rate": 4.968076469868558e-07,
      "loss": 0.3081,
      "num_tokens": 3078396.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20096953213214874,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.43994131684303284,
      "step": 130
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 489.375,
      "completions/mean_terminated_length": 472.1290283203125,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.404320987654321,
      "grad_norm": 6.821984180324421,
      "kl": NaN,
      "learning_rate": 4.967275150147921e-07,
      "loss": -0.3931,
      "num_tokens": 3100128.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1411939263343811,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 131
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 613.0,
      "completions/mean_terminated_length": 570.4827270507812,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.4074074074074074,
      "grad_norm": 6.367082404169811,
      "kl": NaN,
      "learning_rate": 4.966463963888775e-07,
      "loss": 0.2158,
      "num_tokens": 3125992.0,
      "reward": 0.0,
      "reward_std": 0.23708997666835785,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 132
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 481.8125,
      "completions/mean_terminated_length": 464.32257080078125,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "epoch": 0.4104938271604938,
      "grad_norm": 8.869594271037249,
      "kl": NaN,
      "learning_rate": 4.965642914335025e-07,
      "loss": -0.0809,
      "num_tokens": 3147466.0,
      "reward": 0.0,
      "reward_std": 0.26418328285217285,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 133
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 600.34375,
      "completions/mean_terminated_length": 556.5172119140625,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.41358024691358025,
      "grad_norm": 5.855795974935996,
      "kl": NaN,
      "learning_rate": 4.964812004770013e-07,
      "loss": -0.0634,
      "num_tokens": 3172973.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.26164084672927856,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909375190735,
      "step": 134
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 834.0,
      "completions/mean_length": 659.1875,
      "completions/mean_terminated_length": 537.5833740234375,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.4166666666666667,
      "grad_norm": 5.513208877965179,
      "kl": NaN,
      "learning_rate": 4.963971238516519e-07,
      "loss": -0.0041,
      "num_tokens": 3200571.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18096524477005005,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 135
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 804.0,
      "completions/mean_length": 640.4375,
      "completions/mean_terminated_length": 533.0399780273438,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 0.41975308641975306,
      "grad_norm": 8.118947700231773,
      "kl": NaN,
      "learning_rate": 4.963120618936732e-07,
      "loss": 0.113,
      "num_tokens": 3227781.0,
      "reward": 0.0,
      "reward_std": 0.1903054118156433,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 136
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 783.0,
      "completions/mean_length": 616.59375,
      "completions/mean_terminated_length": 589.433349609375,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.4228395061728395,
      "grad_norm": 7.065393529232521,
      "kl": NaN,
      "learning_rate": 4.962260149432247e-07,
      "loss": 0.0773,
      "num_tokens": 3254508.0,
      "reward": 0.0,
      "reward_std": 0.2295312136411667,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096889972687,
      "step": 137
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 583.96875,
      "completions/mean_terminated_length": 538.4483032226562,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 0.42592592592592593,
      "grad_norm": 4.457811645650375,
      "kl": NaN,
      "learning_rate": 4.96138983344405e-07,
      "loss": -0.2813,
      "num_tokens": 3279387.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18175974488258362,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 138
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 744.0,
      "completions/mean_length": 556.15625,
      "completions/mean_terminated_length": 489.3214416503906,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.42901234567901236,
      "grad_norm": 6.333275510929497,
      "kl": NaN,
      "learning_rate": 4.9605096744525e-07,
      "loss": -0.2476,
      "num_tokens": 3303340.0,
      "reward": -5.122274160385132e-09,
      "reward_std": 0.1977911740541458,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 139
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 531.65625,
      "completions/mean_terminated_length": 515.774169921875,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.43209876543209874,
      "grad_norm": 5.0422237424646745,
      "kl": NaN,
      "learning_rate": 4.95961967597732e-07,
      "loss": -0.1874,
      "num_tokens": 3326357.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.15961746871471405,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 140
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 810.0,
      "completions/mean_length": 600.25,
      "completions/mean_terminated_length": 521.7777709960938,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 0.4351851851851852,
      "grad_norm": 6.722046369710051,
      "kl": NaN,
      "learning_rate": 4.958719841577579e-07,
      "loss": -0.1097,
      "num_tokens": 3351869.0,
      "reward": 0.0,
      "reward_std": 0.24473561346530914,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 141
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 604.03125,
      "completions/mean_terminated_length": 544.0357666015625,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.4382716049382716,
      "grad_norm": 8.091365730794006,
      "kl": NaN,
      "learning_rate": 4.957810174851679e-07,
      "loss": -0.1557,
      "num_tokens": 3377718.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.2584119439125061,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 142
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 593.96875,
      "completions/mean_terminated_length": 580.0967407226562,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.44135802469135804,
      "grad_norm": 5.928453859642679,
      "kl": NaN,
      "learning_rate": 4.956890679437345e-07,
      "loss": -0.1874,
      "num_tokens": 3403101.0,
      "reward": 0.0,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 143
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 611.71875,
      "completions/mean_terminated_length": 552.8214721679688,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 0.4444444444444444,
      "grad_norm": 4.435751443104133,
      "kl": NaN,
      "learning_rate": 4.955961359011601e-07,
      "loss": 0.3901,
      "num_tokens": 3428984.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2163362205028534,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 144
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 639.15625,
      "completions/mean_terminated_length": 567.888916015625,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.44753086419753085,
      "grad_norm": 4.566569677013355,
      "kl": NaN,
      "learning_rate": 4.955022217290766e-07,
      "loss": -0.0864,
      "num_tokens": 3455297.0,
      "reward": 0.0,
      "reward_std": 0.1600709855556488,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 145
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 681.9375,
      "completions/mean_terminated_length": 586.1599731445312,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.4506172839506173,
      "grad_norm": 4.42410325272051,
      "kl": NaN,
      "learning_rate": 4.954073258030431e-07,
      "loss": -0.408,
      "num_tokens": 3483491.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.11825646460056305,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 146
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 729.96875,
      "completions/mean_terminated_length": 596.3181762695312,
      "completions/min_length": 212.0,
      "completions/min_terminated_length": 212.0,
      "epoch": 0.4537037037037037,
      "grad_norm": 7.009213791039503,
      "kl": NaN,
      "learning_rate": 4.953114485025446e-07,
      "loss": -0.116,
      "num_tokens": 3513938.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1721172034740448,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 147
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 763.8125,
      "completions/mean_terminated_length": 662.0,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.4567901234567901,
      "grad_norm": 3.751126006916233,
      "kl": NaN,
      "learning_rate": 4.95214590210991e-07,
      "loss": 0.0975,
      "num_tokens": 3545116.0,
      "reward": 0.0,
      "reward_std": 0.1716199815273285,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 148
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 620.1875,
      "completions/mean_terminated_length": 607.1612548828125,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.45987654320987653,
      "grad_norm": 5.421698468082972,
      "kl": NaN,
      "learning_rate": 4.951167513157147e-07,
      "loss": -0.5259,
      "num_tokens": 3571158.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1502964347600937,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 149
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 604.59375,
      "completions/mean_terminated_length": 526.9259033203125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.46296296296296297,
      "grad_norm": 2.3897268658356694,
      "kl": NaN,
      "learning_rate": 4.950179322079697e-07,
      "loss": -0.0652,
      "num_tokens": 3597305.0,
      "reward": 0.0,
      "reward_std": 0.15569782257080078,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 150
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 764.625,
      "completions/mean_terminated_length": 609.0,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.4660493827160494,
      "grad_norm": 2.1609092045248084,
      "kl": NaN,
      "learning_rate": 4.949181332829299e-07,
      "loss": -0.0077,
      "num_tokens": 3629017.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.15826013684272766,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 151
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 649.0625,
      "completions/mean_terminated_length": 544.0800170898438,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.4691358024691358,
      "grad_norm": 4.2898623150959425,
      "kl": NaN,
      "learning_rate": 4.948173549396873e-07,
      "loss": 0.0112,
      "num_tokens": 3656203.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.1922873556613922,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 152
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 621.71875,
      "completions/mean_terminated_length": 547.2222290039062,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.4722222222222222,
      "grad_norm": 4.471696890343921,
      "kl": NaN,
      "learning_rate": 4.947155975812506e-07,
      "loss": -0.2802,
      "num_tokens": 3682434.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.12737129628658295,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 153
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 770.875,
      "completions/mean_terminated_length": 655.8181762695312,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 0.47530864197530864,
      "grad_norm": 2.908866923221855,
      "kl": NaN,
      "learning_rate": 4.946128616145436e-07,
      "loss": -0.015,
      "num_tokens": 3713682.0,
      "reward": 6.51925802230835e-09,
      "reward_std": 0.26219430565834045,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.43994131684303284,
      "step": 154
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 695.84375,
      "completions/mean_terminated_length": 648.9642944335938,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.4783950617283951,
      "grad_norm": 5.082271214668638,
      "kl": NaN,
      "learning_rate": 4.945091474504037e-07,
      "loss": 0.0762,
      "num_tokens": 3742373.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.35248899459838867,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -5.587935447692871e-09,
      "rewards/logprob_reward/std": 0.5679618120193481,
      "step": 155
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 713.6875,
      "completions/mean_terminated_length": 642.0769653320312,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.48148148148148145,
      "grad_norm": 5.098516528190889,
      "kl": NaN,
      "learning_rate": 4.944044555035793e-07,
      "loss": 0.0775,
      "num_tokens": 3771843.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.24149447679519653,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 156
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 636.03125,
      "completions/mean_terminated_length": 595.8965454101562,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.4845679012345679,
      "grad_norm": 15.07093103042608,
      "kl": NaN,
      "learning_rate": 4.9429878619273e-07,
      "loss": -0.4165,
      "num_tokens": 3798908.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.25131285190582275,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 157
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 594.3125,
      "completions/mean_terminated_length": 514.74072265625,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 0.4876543209876543,
      "grad_norm": 5.683678798871177,
      "kl": NaN,
      "learning_rate": 4.941921399404232e-07,
      "loss": -0.6527,
      "num_tokens": 3824110.0,
      "reward": 2.3283064365386963e-09,
      "reward_std": 0.18798649311065674,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 158
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 735.59375,
      "completions/mean_terminated_length": 584.5238037109375,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 0.49074074074074076,
      "grad_norm": 3.0657245224698473,
      "kl": NaN,
      "learning_rate": 4.940845171731329e-07,
      "loss": -0.1424,
      "num_tokens": 3854177.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.19326482713222504,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 159
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 698.34375,
      "completions/mean_terminated_length": 651.8214721679688,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 0.49382716049382713,
      "grad_norm": 6.151965793074086,
      "kl": NaN,
      "learning_rate": 4.939759183212388e-07,
      "loss": -0.0262,
      "num_tokens": 3883380.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.3883397579193115,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 160
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 638.625,
      "completions/mean_terminated_length": 598.7586059570312,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.49691358024691357,
      "grad_norm": 2.3541404509512844,
      "kl": NaN,
      "learning_rate": 4.938663438190232e-07,
      "loss": -0.0192,
      "num_tokens": 3910280.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.26380684971809387,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 161
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 661.8125,
      "completions/mean_terminated_length": 610.0714721679688,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.5,
      "grad_norm": 4.8024894565277005,
      "kl": NaN,
      "learning_rate": 4.937557941046705e-07,
      "loss": -0.1263,
      "num_tokens": 3938054.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.2903626561164856,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 162
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 634.71875,
      "completions/mean_terminated_length": 622.1612548828125,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 0.5030864197530864,
      "grad_norm": 5.909263865787711,
      "kl": NaN,
      "learning_rate": 4.936442696202648e-07,
      "loss": -0.4076,
      "num_tokens": 3965021.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.20059458911418915,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 163
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 648.9375,
      "completions/mean_terminated_length": 543.9199829101562,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.5061728395061729,
      "grad_norm": 5.1300141105548915,
      "kl": NaN,
      "learning_rate": 4.935317708117881e-07,
      "loss": -0.3341,
      "num_tokens": 3992259.0,
      "reward": 0.0,
      "reward_std": 0.26898545026779175,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 164
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 659.3125,
      "completions/mean_terminated_length": 607.2142944335938,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.5092592592592593,
      "grad_norm": 5.913967314016694,
      "kl": NaN,
      "learning_rate": 4.934182981291187e-07,
      "loss": 0.4965,
      "num_tokens": 4020249.0,
      "reward": -9.313225746154785e-09,
      "reward_std": 0.16201332211494446,
      "rewards/format_reward_func/mean": -3.725290298461914e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -2.5029294192790985e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 165
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 730.6875,
      "completions/mean_terminated_length": 597.3636474609375,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.5123456790123457,
      "grad_norm": 5.898192428301864,
      "kl": NaN,
      "learning_rate": 4.933038520260299e-07,
      "loss": 0.0776,
      "num_tokens": 4050363.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.317568302154541,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 693.90625,
      "completions/mean_terminated_length": 601.47998046875,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.5154320987654321,
      "grad_norm": 7.045947141689735,
      "kl": 0.061279296875,
      "learning_rate": 4.931884329601869e-07,
      "loss": -0.6902,
      "num_tokens": 4078984.0,
      "reward": -1.4901161193847656e-08,
      "reward_std": 0.3645022511482239,
      "rewards/format_reward_func/mean": -2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5679618716239929,
      "step": 167
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 696.03125,
      "completions/mean_terminated_length": 604.2000122070312,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.5185185185185185,
      "grad_norm": 4.299896977851904,
      "kl": NaN,
      "learning_rate": 4.930720413931463e-07,
      "loss": -0.2749,
      "num_tokens": 4107801.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.1616957187652588,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 168
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 836.0,
      "completions/mean_length": 709.84375,
      "completions/mean_terminated_length": 567.0454711914062,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 0.5216049382716049,
      "grad_norm": 7.697107711394947,
      "kl": NaN,
      "learning_rate": 4.929546777903534e-07,
      "loss": -0.026,
      "num_tokens": 4136728.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.3467341661453247,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 169
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 653.03125,
      "completions/mean_terminated_length": 584.3333129882812,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.5246913580246914,
      "grad_norm": 5.383314154215838,
      "kl": NaN,
      "learning_rate": 4.928363426211407e-07,
      "loss": 0.225,
      "num_tokens": 4163881.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.19019806385040283,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 170
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 760.8125,
      "completions/mean_terminated_length": 673.0833740234375,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.5277777777777778,
      "grad_norm": 5.183491143280448,
      "kl": NaN,
      "learning_rate": 4.927170363587262e-07,
      "loss": -0.2029,
      "num_tokens": 4194395.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1886770874261856,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 171
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 658.5625,
      "completions/mean_terminated_length": 574.2307739257812,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.5308641975308642,
      "grad_norm": 3.313019723031103,
      "kl": NaN,
      "learning_rate": 4.925967594802109e-07,
      "loss": -0.0884,
      "num_tokens": 4221761.0,
      "reward": -1.3969838619232178e-09,
      "reward_std": 0.20295457541942596,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 823.0,
      "completions/mean_length": 777.09375,
      "completions/mean_terminated_length": 585.0555419921875,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 0.5339506172839507,
      "grad_norm": 4.536784003490762,
      "kl": 0.0555419921875,
      "learning_rate": 4.924755124665774e-07,
      "loss": -0.1002,
      "num_tokens": 4253128.0,
      "reward": 0.0,
      "reward_std": 0.20221075415611267,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 173
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 715.96875,
      "completions/mean_terminated_length": 644.8846435546875,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 0.5370370370370371,
      "grad_norm": 4.803885137174439,
      "kl": NaN,
      "learning_rate": 4.923532958026878e-07,
      "loss": -0.1473,
      "num_tokens": 4282771.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.26001739501953125,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 174
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 677.5,
      "completions/mean_terminated_length": 597.5385131835938,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.5401234567901234,
      "grad_norm": 2.882320477201461,
      "kl": NaN,
      "learning_rate": 4.922301099772821e-07,
      "loss": -0.315,
      "num_tokens": 4311107.0,
      "reward": 0.0,
      "reward_std": 0.2995503544807434,
      "rewards/format_reward_func/mean": -2.60770320892334e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5388159155845642,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 735.71875,
      "completions/mean_terminated_length": 622.9130249023438,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 0.5432098765432098,
      "grad_norm": 3.759131324924152,
      "kl": 0.05584716796875,
      "learning_rate": 4.921059554829753e-07,
      "loss": -0.5024,
      "num_tokens": 4341462.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.2578310966491699,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 176
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 662.0,
      "completions/mean_terminated_length": 560.6400146484375,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.5462962962962963,
      "grad_norm": 4.695136669827626,
      "kl": NaN,
      "learning_rate": 4.91980832816257e-07,
      "loss": -0.0648,
      "num_tokens": 4369162.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.1863657832145691,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 177
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 809.21875,
      "completions/mean_terminated_length": 662.26318359375,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 0.5493827160493827,
      "grad_norm": 2.106420448094769,
      "kl": NaN,
      "learning_rate": 4.918547424774873e-07,
      "loss": 0.0886,
      "num_tokens": 4402133.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.17791201174259186,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 178
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 752.34375,
      "completions/mean_terminated_length": 610.047607421875,
      "completions/min_length": 253.0,
      "completions/min_terminated_length": 253.0,
      "epoch": 0.5524691358024691,
      "grad_norm": 3.4961102804099884,
      "kl": NaN,
      "learning_rate": 4.917276849708972e-07,
      "loss": 0.1677,
      "num_tokens": 4432724.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.2326449453830719,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 727.1875,
      "completions/mean_terminated_length": 644.0799560546875,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 0.5555555555555556,
      "grad_norm": 4.2019127086886785,
      "kl": 0.061187744140625,
      "learning_rate": 4.915996608045842e-07,
      "loss": -0.1858,
      "num_tokens": 4462386.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.3091117739677429,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 180
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 823.0,
      "completions/mean_length": 654.21875,
      "completions/mean_terminated_length": 568.8846435546875,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.558641975308642,
      "grad_norm": 3.469894708774101,
      "kl": NaN,
      "learning_rate": 4.914706704905125e-07,
      "loss": -0.208,
      "num_tokens": 4489217.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.19741347432136536,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 181
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 747.1875,
      "completions/mean_terminated_length": 654.9166870117188,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 0.5617283950617284,
      "grad_norm": 2.43344670166162,
      "kl": NaN,
      "learning_rate": 4.913407145445093e-07,
      "loss": -0.2519,
      "num_tokens": 4519575.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.2075708508491516,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 182
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 768.5625,
      "completions/mean_terminated_length": 683.4166870117188,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.5648148148148148,
      "grad_norm": 3.416727072334397,
      "kl": NaN,
      "learning_rate": 4.912097934862632e-07,
      "loss": -0.3776,
      "num_tokens": 4550805.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.283363938331604,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 183
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 786.34375,
      "completions/mean_terminated_length": 678.3181762695312,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.5679012345679012,
      "grad_norm": 2.449164100227983,
      "kl": NaN,
      "learning_rate": 4.910779078393228e-07,
      "loss": -0.1256,
      "num_tokens": 4582372.0,
      "reward": 0.0,
      "reward_std": 0.15146484971046448,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 184
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 737.15625,
      "completions/mean_terminated_length": 606.7727661132812,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.5709876543209876,
      "grad_norm": 4.024706935369811,
      "kl": NaN,
      "learning_rate": 4.909450581310935e-07,
      "loss": -0.0544,
      "num_tokens": 4612405.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.262442946434021,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 185
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 707.0,
      "completions/mean_terminated_length": 633.84619140625,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 0.5740740740740741,
      "grad_norm": 2.6405622879977413,
      "kl": NaN,
      "learning_rate": 4.908112448928363e-07,
      "loss": 0.0276,
      "num_tokens": 4641465.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.23802822828292847,
      "rewards/format_reward_func/mean": -3.725290298461914e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 186
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 734.375,
      "completions/mean_terminated_length": 602.727294921875,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.5771604938271605,
      "grad_norm": 3.206208208105168,
      "kl": NaN,
      "learning_rate": 4.906764686596651e-07,
      "loss": -0.133,
      "num_tokens": 4670741.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.20190681517124176,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 187
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 690.375,
      "completions/mean_terminated_length": 596.9599609375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.5802469135802469,
      "grad_norm": 4.086080046056693,
      "kl": NaN,
      "learning_rate": 4.90540729970545e-07,
      "loss": 0.2032,
      "num_tokens": 4699437.0,
      "reward": 0.0,
      "reward_std": 0.1954423040151596,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 188
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 715.8125,
      "completions/mean_terminated_length": 644.6923217773438,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.5833333333333334,
      "grad_norm": 2.8338398303145413,
      "kl": NaN,
      "learning_rate": 4.904040293682897e-07,
      "loss": -0.1297,
      "num_tokens": 4728543.0,
      "reward": -9.313225746154785e-09,
      "reward_std": 0.20422792434692383,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 790.78125,
      "completions/mean_terminated_length": 650.8500366210938,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 0.5864197530864198,
      "grad_norm": 2.8438341579660102,
      "kl": 0.066864013671875,
      "learning_rate": 4.902663673995597e-07,
      "loss": 0.1729,
      "num_tokens": 4760248.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.23384305834770203,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 831.65625,
      "completions/mean_terminated_length": 700.0526123046875,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 0.5895061728395061,
      "grad_norm": 2.780619605697074,
      "kl": 0.0731201171875,
      "learning_rate": 4.9012774461486e-07,
      "loss": -0.089,
      "num_tokens": 4793725.0,
      "reward": 0.0,
      "reward_std": 0.28481680154800415,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 191
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 812.90625,
      "completions/mean_terminated_length": 702.3333740234375,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.5925925925925926,
      "grad_norm": 2.6545162579981834,
      "kl": NaN,
      "learning_rate": 4.899881615685376e-07,
      "loss": 0.0325,
      "num_tokens": 4826738.0,
      "reward": -1.4901161193847656e-08,
      "reward_std": 0.29407086968421936,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 192
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 806.0625,
      "completions/mean_terminated_length": 691.90478515625,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.595679012345679,
      "grad_norm": 4.497736207531074,
      "kl": NaN,
      "learning_rate": 4.898476188187798e-07,
      "loss": -0.0457,
      "num_tokens": 4858916.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18742044270038605,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 193
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 722.96875,
      "completions/mean_terminated_length": 653.5,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.5987654320987654,
      "grad_norm": 2.304468478057239,
      "kl": NaN,
      "learning_rate": 4.897061169276118e-07,
      "loss": -0.2346,
      "num_tokens": 4888071.0,
      "reward": -8.381903171539307e-09,
      "reward_std": 0.14934472739696503,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 194
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 780.4375,
      "completions/mean_terminated_length": 652.857177734375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.6018518518518519,
      "grad_norm": 1.663862730374436,
      "kl": NaN,
      "learning_rate": 4.895636564608942e-07,
      "loss": -0.0296,
      "num_tokens": 4919649.0,
      "reward": -6.51925802230835e-09,
      "reward_std": 0.15778718888759613,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 733.4375,
      "completions/mean_terminated_length": 652.0799560546875,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.6049382716049383,
      "grad_norm": 6.552853168590596,
      "kl": 0.091400146484375,
      "learning_rate": 4.894202379883206e-07,
      "loss": 0.0893,
      "num_tokens": 4949807.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19026055932044983,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 196
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 744.9375,
      "completions/mean_terminated_length": 693.25927734375,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.6080246913580247,
      "grad_norm": 2.0717863669209438,
      "kl": NaN,
      "learning_rate": 4.892758620834165e-07,
      "loss": -0.0606,
      "num_tokens": 4980221.0,
      "reward": -1.1175870895385742e-08,
      "reward_std": 0.2551858425140381,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909375190735,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 732.4375,
      "completions/mean_terminated_length": 650.7999877929688,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 0.6111111111111112,
      "grad_norm": 3.4708064378122376,
      "kl": 0.0716552734375,
      "learning_rate": 4.891305293235351e-07,
      "loss": -0.1915,
      "num_tokens": 5009911.0,
      "reward": -1.4901161193847656e-08,
      "reward_std": 0.3596891760826111,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 198
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 788.65625,
      "completions/mean_terminated_length": 665.3809814453125,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.6141975308641975,
      "grad_norm": 1.6743537613630881,
      "kl": NaN,
      "learning_rate": 4.889842402898569e-07,
      "loss": -0.0209,
      "num_tokens": 5042052.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.26783668994903564,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 199
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 857.5625,
      "completions/mean_terminated_length": 757.7000122070312,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.6172839506172839,
      "grad_norm": 1.7681695511576603,
      "kl": NaN,
      "learning_rate": 4.888369955673858e-07,
      "loss": -0.0328,
      "num_tokens": 5076666.0,
      "reward": 5.587935447692871e-09,
      "reward_std": 0.25736480951309204,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 768.71875,
      "completions/mean_terminated_length": 652.6818237304688,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 0.6203703703703703,
      "grad_norm": 2.423013369291697,
      "kl": 0.089080810546875,
      "learning_rate": 4.88688795744948e-07,
      "loss": -0.3225,
      "num_tokens": 5108017.0,
      "reward": 5.587935447692871e-09,
      "reward_std": 0.2015441507101059,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 753.125,
      "completions/mean_terminated_length": 647.1304321289062,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 0.6234567901234568,
      "grad_norm": 3.878827640491604,
      "kl": 0.09527587890625,
      "learning_rate": 4.885396414151888e-07,
      "loss": 0.1059,
      "num_tokens": 5138557.0,
      "reward": 0.0,
      "reward_std": 0.26025640964508057,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 202
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 633.28125,
      "completions/mean_terminated_length": 577.4642944335938,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.6265432098765432,
      "grad_norm": 5.296543435346333,
      "kl": NaN,
      "learning_rate": 4.883895331745707e-07,
      "loss": -0.4315,
      "num_tokens": 5164726.0,
      "reward": -1.1175870895385742e-08,
      "reward_std": 0.2554783821105957,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 739.15625,
      "completions/mean_terminated_length": 609.6818237304688,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 0.6296296296296297,
      "grad_norm": 4.665205117691723,
      "kl": 0.0711669921875,
      "learning_rate": 4.882384716233709e-07,
      "loss": -0.5091,
      "num_tokens": 5194635.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2635560929775238,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 204
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 767.21875,
      "completions/mean_terminated_length": 650.5,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.6327160493827161,
      "grad_norm": 2.9408032043750487,
      "kl": NaN,
      "learning_rate": 4.880864573656785e-07,
      "loss": -0.2056,
      "num_tokens": 5226358.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.29013127088546753,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 205
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 739.0625,
      "completions/mean_terminated_length": 609.5454711914062,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.6358024691358025,
      "grad_norm": 3.213343905069938,
      "kl": NaN,
      "learning_rate": 4.879334910093926e-07,
      "loss": -0.264,
      "num_tokens": 5256600.0,
      "reward": 0.0,
      "reward_std": 0.2007269412279129,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 814.625,
      "completions/mean_terminated_length": 605.25,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 0.6388888888888888,
      "grad_norm": 3.6603473739931105,
      "kl": 0.078094482421875,
      "learning_rate": 4.877795731662202e-07,
      "loss": -0.3169,
      "num_tokens": 5289044.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.20887130498886108,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 207
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 810.5625,
      "completions/mean_terminated_length": 682.5,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 0.6419753086419753,
      "grad_norm": 3.005784406698742,
      "kl": NaN,
      "learning_rate": 4.876247044516724e-07,
      "loss": -0.1685,
      "num_tokens": 5321606.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.17495723068714142,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 812.15625,
      "completions/mean_terminated_length": 701.1904907226562,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 0.6450617283950617,
      "grad_norm": 3.055435065655736,
      "kl": 0.0987548828125,
      "learning_rate": 4.874688854850635e-07,
      "loss": -0.2587,
      "num_tokens": 5354467.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.2891131043434143,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 209
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 678.71875,
      "completions/mean_terminated_length": 563.625,
      "completions/min_length": 278.0,
      "completions/min_terminated_length": 278.0,
      "epoch": 0.6481481481481481,
      "grad_norm": 2.8793279918757797,
      "kl": NaN,
      "learning_rate": 4.873121168895075e-07,
      "loss": -0.1438,
      "num_tokens": 5382422.0,
      "reward": 1.4901161193847656e-08,
      "reward_std": 0.3393701910972595,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 757.0,
      "completions/mean_terminated_length": 668.0,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 0.6512345679012346,
      "grad_norm": 2.533179208177293,
      "kl": 0.080718994140625,
      "learning_rate": 4.87154399291916e-07,
      "loss": -0.192,
      "num_tokens": 5413010.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.1891198754310608,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 211
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 829.34375,
      "completions/mean_terminated_length": 677.9444580078125,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.654320987654321,
      "grad_norm": 2.978035054435234,
      "kl": NaN,
      "learning_rate": 4.869957333229955e-07,
      "loss": -0.2636,
      "num_tokens": 5445613.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.23661711812019348,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096889972687,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 678.75,
      "completions/mean_terminated_length": 614.8148193359375,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.6574074074074074,
      "grad_norm": 3.2081156207665313,
      "kl": 0.109619140625,
      "learning_rate": 4.868361196172453e-07,
      "loss": 0.1124,
      "num_tokens": 5473581.0,
      "reward": 0.0,
      "reward_std": 0.2787058353424072,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 213
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 729.125,
      "completions/mean_terminated_length": 630.8333740234375,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.6604938271604939,
      "grad_norm": 2.21977273685207,
      "kl": NaN,
      "learning_rate": 4.866755588129542e-07,
      "loss": -0.1336,
      "num_tokens": 5503061.0,
      "reward": -4.6566128730773926e-09,
      "reward_std": 0.19088414311408997,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 214
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 812.8125,
      "completions/mean_terminated_length": 626.4705810546875,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 0.6635802469135802,
      "grad_norm": 3.396468325787721,
      "kl": NaN,
      "learning_rate": 4.86514051552199e-07,
      "loss": -0.303,
      "num_tokens": 5535647.0,
      "reward": 0.0,
      "reward_std": 0.19001758098602295,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 787.09375,
      "completions/mean_terminated_length": 644.9500122070312,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.6666666666666666,
      "grad_norm": 3.70603151927325,
      "kl": 0.0989990234375,
      "learning_rate": 4.863515984808408e-07,
      "loss": 0.0477,
      "num_tokens": 5567206.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.33198004961013794,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 216
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 713.125,
      "completions/mean_terminated_length": 609.5,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.6697530864197531,
      "grad_norm": 4.981690107695676,
      "kl": NaN,
      "learning_rate": 4.861882002485234e-07,
      "loss": -0.5205,
      "num_tokens": 5596386.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.20860743522644043,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 741.5,
      "completions/mean_terminated_length": 593.5238037109375,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.6728395061728395,
      "grad_norm": 2.9858354697398206,
      "kl": 0.1287841796875,
      "learning_rate": 4.860238575086699e-07,
      "loss": -0.2922,
      "num_tokens": 5627022.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1880928874015808,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 218
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 738.4375,
      "completions/mean_terminated_length": 626.6956787109375,
      "completions/min_length": 245.0,
      "completions/min_terminated_length": 245.0,
      "epoch": 0.6759259259259259,
      "grad_norm": 2.2240921510399967,
      "kl": NaN,
      "learning_rate": 4.858585709184806e-07,
      "loss": -0.0813,
      "num_tokens": 5656968.0,
      "reward": 0.0,
      "reward_std": 0.2712445855140686,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 790.90625,
      "completions/mean_terminated_length": 631.4210815429688,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 0.6790123456790124,
      "grad_norm": 1.3628167364247696,
      "kl": 0.10546875,
      "learning_rate": 4.856923411389302e-07,
      "loss": -0.002,
      "num_tokens": 5688765.0,
      "reward": 0.0,
      "reward_std": 0.1343029886484146,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 777.34375,
      "completions/mean_terminated_length": 695.125,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.6820987654320988,
      "grad_norm": 2.426558623160803,
      "kl": 0.10418701171875,
      "learning_rate": 4.855251688347653e-07,
      "loss": -0.1912,
      "num_tokens": 5720220.0,
      "reward": 0.0,
      "reward_std": 0.25110799074172974,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 221
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 855.0625,
      "completions/mean_terminated_length": 739.4736938476562,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.6851851851851852,
      "grad_norm": 2.133269936151635,
      "kl": NaN,
      "learning_rate": 4.853570546745014e-07,
      "loss": -0.0728,
      "num_tokens": 5754410.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.21861866116523743,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.1641532182693481e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 783.96875,
      "completions/mean_terminated_length": 639.9500122070312,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.6882716049382716,
      "grad_norm": 3.7839727813702058,
      "kl": 0.085906982421875,
      "learning_rate": 4.851879993304208e-07,
      "loss": 0.0654,
      "num_tokens": 5785749.0,
      "reward": 0.0,
      "reward_std": 0.23473702371120453,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 223
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 809.9375,
      "completions/mean_terminated_length": 681.5,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 0.691358024691358,
      "grad_norm": 1.826492925221081,
      "kl": NaN,
      "learning_rate": 4.850180034785691e-07,
      "loss": -0.1136,
      "num_tokens": 5818415.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.07151594012975693,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 224
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 780.90625,
      "completions/mean_terminated_length": 635.0499877929688,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 0.6944444444444444,
      "grad_norm": 1.9202875738575542,
      "kl": NaN,
      "learning_rate": 4.848470677987532e-07,
      "loss": -0.113,
      "num_tokens": 5850032.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.22210678458213806,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 225
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 805.53125,
      "completions/mean_terminated_length": 691.0952758789062,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.6975308641975309,
      "grad_norm": 2.017837093796174,
      "kl": NaN,
      "learning_rate": 4.846751929745383e-07,
      "loss": -0.0029,
      "num_tokens": 5882289.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.19541588425636292,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 861.0,
      "completions/mean_length": 728.71875,
      "completions/mean_terminated_length": 574.047607421875,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.7006172839506173,
      "grad_norm": 3.1385720720247985,
      "kl": 0.1187744140625,
      "learning_rate": 4.845023796932454e-07,
      "loss": -0.204,
      "num_tokens": 5911532.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.22258394956588745,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 737.125,
      "completions/mean_terminated_length": 624.8695678710938,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 0.7037037037037037,
      "grad_norm": 2.0482393741551417,
      "kl": 0.0987548828125,
      "learning_rate": 4.84328628645948e-07,
      "loss": 0.0905,
      "num_tokens": 5941608.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.1891579031944275,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 228
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 788.0625,
      "completions/mean_terminated_length": 604.5555419921875,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.7067901234567902,
      "grad_norm": 3.4958822091772714,
      "kl": NaN,
      "learning_rate": 4.841539405274698e-07,
      "loss": -0.1844,
      "num_tokens": 5973686.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.24194923043251038,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 775.46875,
      "completions/mean_terminated_length": 645.2857055664062,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.7098765432098766,
      "grad_norm": 3.1229631584357502,
      "kl": 0.095977783203125,
      "learning_rate": 4.839783160363821e-07,
      "loss": -0.193,
      "num_tokens": 6005525.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2672354578971863,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 731.71875,
      "completions/mean_terminated_length": 617.3478393554688,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 0.7129629629629629,
      "grad_norm": 2.2642799708066583,
      "kl": 0.105316162109375,
      "learning_rate": 4.838017558750004e-07,
      "loss": -0.293,
      "num_tokens": 6035812.0,
      "reward": 0.0,
      "reward_std": 0.173539400100708,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 231
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 795.53125,
      "completions/mean_terminated_length": 658.4500122070312,
      "completions/min_length": 284.0,
      "completions/min_terminated_length": 284.0,
      "epoch": 0.7160493827160493,
      "grad_norm": 5.1671418250930055,
      "kl": NaN,
      "learning_rate": 4.836242607493819e-07,
      "loss": 0.0455,
      "num_tokens": 6068105.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.34079915285110474,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4751909375190735,
      "step": 232
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 723.0625,
      "completions/mean_terminated_length": 622.75,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.7191358024691358,
      "grad_norm": 2.0798496267906352,
      "kl": NaN,
      "learning_rate": 4.834458313693228e-07,
      "loss": 0.0239,
      "num_tokens": 6097931.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.2358597218990326,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 696.5,
      "completions/mean_terminated_length": 604.7999877929688,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.7222222222222222,
      "grad_norm": 4.333848724052473,
      "kl": 0.11224365234375,
      "learning_rate": 4.832664684483555e-07,
      "loss": -0.1502,
      "num_tokens": 6126531.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.24924881756305695,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 234
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 783.25,
      "completions/mean_terminated_length": 570.8235473632812,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 0.7253086419753086,
      "grad_norm": 1.5885116011981246,
      "kl": NaN,
      "learning_rate": 4.830861727037453e-07,
      "loss": 0.0281,
      "num_tokens": 6158191.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.07657893747091293,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 836.6875,
      "completions/mean_terminated_length": 649.375,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 0.7283950617283951,
      "grad_norm": 4.699831311502976,
      "kl": 0.123870849609375,
      "learning_rate": 4.82904944856488e-07,
      "loss": -0.4437,
      "num_tokens": 6191801.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 706.0625,
      "completions/mean_terminated_length": 617.0399780273438,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.7314814814814815,
      "grad_norm": 3.296723607037876,
      "kl": 0.12939453125,
      "learning_rate": 4.827227856313066e-07,
      "loss": -0.297,
      "num_tokens": 6220767.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.27311262488365173,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 237
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 746.34375,
      "completions/mean_terminated_length": 637.6956787109375,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.7345679012345679,
      "grad_norm": 3.3829561076219923,
      "kl": NaN,
      "learning_rate": 4.825396957566491e-07,
      "loss": 0.1273,
      "num_tokens": 6251782.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18932479619979858,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 649.71875,
      "completions/mean_terminated_length": 580.4074096679688,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.7376543209876543,
      "grad_norm": 4.971394904707194,
      "kl": 0.12762451171875,
      "learning_rate": 4.823556759646847e-07,
      "loss": -0.1317,
      "num_tokens": 6278893.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.2861737608909607,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 817.0,
      "completions/mean_length": 731.9375,
      "completions/mean_terminated_length": 617.6521606445312,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.7407407407407407,
      "grad_norm": 3.9598679602177236,
      "kl": 0.1295166015625,
      "learning_rate": 4.821707269913016e-07,
      "loss": -0.3395,
      "num_tokens": 6309431.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.25012826919555664,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 677.46875,
      "completions/mean_terminated_length": 561.9583740234375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.7438271604938271,
      "grad_norm": 3.493835067695898,
      "kl": 0.1341552734375,
      "learning_rate": 4.819848495761037e-07,
      "loss": -0.2769,
      "num_tokens": 6337758.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.21241632103919983,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 241
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 688.375,
      "completions/mean_terminated_length": 576.5,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.7469135802469136,
      "grad_norm": 4.054686799800771,
      "kl": NaN,
      "learning_rate": 4.817980444624076e-07,
      "loss": -0.2903,
      "num_tokens": 6366362.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15422162413597107,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 748.375,
      "completions/mean_terminated_length": 656.5,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 0.75,
      "grad_norm": 3.2645994838077947,
      "kl": 0.09912109375,
      "learning_rate": 4.816103123972395e-07,
      "loss": 0.1701,
      "num_tokens": 6396726.0,
      "reward": 6.51925802230835e-09,
      "reward_std": 0.3210039734840393,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 758.6875,
      "completions/mean_terminated_length": 654.8695678710938,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 0.7530864197530864,
      "grad_norm": 3.4538214639853657,
      "kl": 0.13458251953125,
      "learning_rate": 4.814216541313329e-07,
      "loss": 0.0184,
      "num_tokens": 6427664.0,
      "reward": -4.6566128730773926e-09,
      "reward_std": 0.2180139422416687,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -2.0954757928848267e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 727.5,
      "completions/mean_terminated_length": 628.6666870117188,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.7561728395061729,
      "grad_norm": 3.294021912055393,
      "kl": 0.12322998046875,
      "learning_rate": 4.812320704191252e-07,
      "loss": 0.1031,
      "num_tokens": 6457412.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14231424033641815,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 245
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 749.0625,
      "completions/mean_terminated_length": 624.0909423828125,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.7592592592592593,
      "grad_norm": 2.315221828899277,
      "kl": NaN,
      "learning_rate": 4.81041562018754e-07,
      "loss": -0.2133,
      "num_tokens": 6487486.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.21590662002563477,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 246
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 736.6875,
      "completions/mean_terminated_length": 624.2608642578125,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 0.7623456790123457,
      "grad_norm": 3.3242674540332513,
      "kl": NaN,
      "learning_rate": 4.808501296920552e-07,
      "loss": -0.1676,
      "num_tokens": 6517144.0,
      "reward": 0.0,
      "reward_std": 0.20270907878875732,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 880.0,
      "completions/mean_length": 717.75,
      "completions/mean_terminated_length": 647.0769653320312,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.7654320987654321,
      "grad_norm": 3.1084229164813078,
      "kl": 0.1446533203125,
      "learning_rate": 4.806577742045593e-07,
      "loss": -0.2875,
      "num_tokens": 6546780.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.16301563382148743,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 248
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 782.34375,
      "completions/mean_terminated_length": 672.5,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 0.7685185185185185,
      "grad_norm": 2.2457243077069826,
      "kl": NaN,
      "learning_rate": 4.804644963254887e-07,
      "loss": -0.1534,
      "num_tokens": 6578251.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1873370110988617,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.3969838619232178e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 249
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 740.21875,
      "completions/mean_terminated_length": 629.1739501953125,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.7716049382716049,
      "grad_norm": 1.807821247523498,
      "kl": NaN,
      "learning_rate": 4.80270296827754e-07,
      "loss": -0.1799,
      "num_tokens": 6608766.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.10673221200704575,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 725.59375,
      "completions/mean_terminated_length": 642.0399780273438,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.7746913580246914,
      "grad_norm": 1.8298842724913364,
      "kl": 0.143798828125,
      "learning_rate": 4.800751764879516e-07,
      "loss": -0.1244,
      "num_tokens": 6638337.0,
      "reward": 0.0,
      "reward_std": 0.16211052238941193,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 740.84375,
      "completions/mean_terminated_length": 661.5599975585938,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.7777777777777778,
      "grad_norm": 2.268148157743323,
      "kl": 0.1407470703125,
      "learning_rate": 4.798791360863602e-07,
      "loss": -0.1694,
      "num_tokens": 6668616.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.21965095400810242,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 741.90625,
      "completions/mean_terminated_length": 613.6818237304688,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.7808641975308642,
      "grad_norm": 3.3267154691316025,
      "kl": 0.172393798828125,
      "learning_rate": 4.796821764069378e-07,
      "loss": -0.2182,
      "num_tokens": 6698989.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.12776592373847961,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 789.71875,
      "completions/mean_terminated_length": 698.0435180664062,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 0.7839506172839507,
      "grad_norm": 2.8463300517998205,
      "kl": 0.13848876953125,
      "learning_rate": 4.794842982373188e-07,
      "loss": -0.3921,
      "num_tokens": 6730656.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.30081069469451904,
      "rewards/format_reward_func/mean": 3.725290298461914e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 774.1875,
      "completions/mean_terminated_length": 660.6364135742188,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 0.7870370370370371,
      "grad_norm": 2.6787479352006054,
      "kl": 0.15087890625,
      "learning_rate": 4.7928550236881e-07,
      "loss": -0.1829,
      "num_tokens": 6761922.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19329413771629333,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 692.96875,
      "completions/mean_terminated_length": 600.2799682617188,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "epoch": 0.7901234567901234,
      "grad_norm": 2.9002315054338776,
      "kl": 0.13751220703125,
      "learning_rate": 4.790857895963888e-07,
      "loss": -0.4516,
      "num_tokens": 6789985.0,
      "reward": 9.313225746154785e-09,
      "reward_std": 0.2881210148334503,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 746.34375,
      "completions/mean_terminated_length": 600.90478515625,
      "completions/min_length": 301.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 0.7932098765432098,
      "grad_norm": 4.284324758344148,
      "kl": 0.15911865234375,
      "learning_rate": 4.788851607186988e-07,
      "loss": -0.4735,
      "num_tokens": 6820148.0,
      "reward": 0.0,
      "reward_std": 0.15029644966125488,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 736.0625,
      "completions/mean_terminated_length": 640.0833740234375,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.7962962962962963,
      "grad_norm": 5.010981769482883,
      "kl": 0.14849853515625,
      "learning_rate": 4.786836165380472e-07,
      "loss": -0.6461,
      "num_tokens": 6849914.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.14930030703544617,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 852.0,
      "completions/mean_length": 726.875,
      "completions/mean_terminated_length": 591.8181762695312,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.7993827160493827,
      "grad_norm": 3.814852394046285,
      "kl": 0.13836669921875,
      "learning_rate": 4.784811578604013e-07,
      "loss": 0.3629,
      "num_tokens": 6879306.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.22981852293014526,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 708.65625,
      "completions/mean_terminated_length": 565.3181762695312,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.8024691358024691,
      "grad_norm": 6.197667422340778,
      "kl": 0.172607421875,
      "learning_rate": 4.782777854953857e-07,
      "loss": -0.2984,
      "num_tokens": 6908743.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1861894428730011,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 737.84375,
      "completions/mean_terminated_length": 657.719970703125,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "epoch": 0.8055555555555556,
      "grad_norm": 1.4501911673247054,
      "kl": 0.1632080078125,
      "learning_rate": 4.780735002562785e-07,
      "loss": -0.0344,
      "num_tokens": 6938870.0,
      "reward": 0.02812500298023224,
      "reward_std": 0.12218310683965683,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 711.5,
      "completions/mean_terminated_length": 589.2174072265625,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "epoch": 0.808641975308642,
      "grad_norm": 2.4850563544987287,
      "kl": 0.1593017578125,
      "learning_rate": 4.778683029600089e-07,
      "loss": -0.1642,
      "num_tokens": 6968186.0,
      "reward": 0.0,
      "reward_std": 0.2453797608613968,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 262
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 707.1875,
      "completions/mean_terminated_length": 634.0769653320312,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.8117283950617284,
      "grad_norm": 2.638204819765845,
      "kl": NaN,
      "learning_rate": 4.776621944271526e-07,
      "loss": -0.2079,
      "num_tokens": 6997596.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.201801598072052,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 702.75,
      "completions/mean_terminated_length": 612.7999877929688,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.8148148148148148,
      "grad_norm": 4.922844441620661,
      "kl": 0.138427734375,
      "learning_rate": 4.774551754819299e-07,
      "loss": -0.4543,
      "num_tokens": 7026360.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.22252653539180756,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 753.1875,
      "completions/mean_terminated_length": 662.9166870117188,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.8179012345679012,
      "grad_norm": 2.4659294983487468,
      "kl": 0.14898681640625,
      "learning_rate": 4.772472469522015e-07,
      "loss": -0.1614,
      "num_tokens": 7057062.0,
      "reward": 5.587935447692871e-09,
      "reward_std": 0.27855920791625977,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 788.0,
      "completions/mean_terminated_length": 664.3809814453125,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 0.8209876543209876,
      "grad_norm": 3.1817469660601505,
      "kl": 0.14874267578125,
      "learning_rate": 4.770384096694658e-07,
      "loss": -0.0496,
      "num_tokens": 7088598.0,
      "reward": 5.587935447692871e-09,
      "reward_std": 0.27550405263900757,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -2.561137080192566e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 747.21875,
      "completions/mean_terminated_length": 683.34619140625,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.8240740740740741,
      "grad_norm": 2.648277388892835,
      "kl": 0.14556884765625,
      "learning_rate": 4.7682866446885475e-07,
      "loss": -0.387,
      "num_tokens": 7119097.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18597835302352905,
      "rewards/format_reward_func/mean": -2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 812.28125,
      "completions/mean_terminated_length": 600.5625,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 0.8271604938271605,
      "grad_norm": 1.895103808274615,
      "kl": 0.16876220703125,
      "learning_rate": 4.766180121891316e-07,
      "loss": 0.0752,
      "num_tokens": 7151826.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15634265542030334,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 737.46875,
      "completions/mean_terminated_length": 657.239990234375,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 0.8302469135802469,
      "grad_norm": 2.4635694188564927,
      "kl": 0.1478271484375,
      "learning_rate": 4.7640645367268663e-07,
      "loss": -0.2284,
      "num_tokens": 7181905.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1880928874015808,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 810.8125,
      "completions/mean_terminated_length": 682.9000244140625,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 0.8333333333333334,
      "grad_norm": 1.7543640852318774,
      "kl": 0.1400146484375,
      "learning_rate": 4.761939897655343e-07,
      "loss": -0.1302,
      "num_tokens": 7214247.0,
      "reward": 0.0,
      "reward_std": 0.18571428954601288,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 820.5625,
      "completions/mean_terminated_length": 662.3333129882812,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "epoch": 0.8364197530864198,
      "grad_norm": 1.9597368769750425,
      "kl": 0.1575927734375,
      "learning_rate": 4.7598062131730943e-07,
      "loss": -0.0789,
      "num_tokens": 7246569.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.142043799161911,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 745.84375,
      "completions/mean_terminated_length": 619.4091186523438,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.8395061728395061,
      "grad_norm": 2.964069719910258,
      "kl": 0.16131591796875,
      "learning_rate": 4.757663491812644e-07,
      "loss": -0.4122,
      "num_tokens": 7276844.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1861894428730011,
      "rewards/format_reward_func/mean": -3.725290298461914e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 795.4375,
      "completions/mean_terminated_length": 658.2999877929688,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 0.8425925925925926,
      "grad_norm": 1.3766206164921555,
      "kl": 0.1619873046875,
      "learning_rate": 4.755511742142652e-07,
      "loss": -0.0572,
      "num_tokens": 7308802.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.06432675570249557,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 737.1875,
      "completions/mean_terminated_length": 624.95654296875,
      "completions/min_length": 284.0,
      "completions/min_terminated_length": 284.0,
      "epoch": 0.845679012345679,
      "grad_norm": 2.0849285872933416,
      "kl": 0.14141845703125,
      "learning_rate": 4.753350972767883e-07,
      "loss": 0.0073,
      "num_tokens": 7338484.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20247438549995422,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 819.84375,
      "completions/mean_terminated_length": 712.90478515625,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 0.8487654320987654,
      "grad_norm": 1.632688373463398,
      "kl": 0.149658203125,
      "learning_rate": 4.75118119232917e-07,
      "loss": -0.1197,
      "num_tokens": 7371347.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.16597789525985718,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 798.40625,
      "completions/mean_terminated_length": 695.8636474609375,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.8518518518518519,
      "grad_norm": 1.4641771387201727,
      "kl": 0.16363525390625,
      "learning_rate": 4.749002409503382e-07,
      "loss": -0.0562,
      "num_tokens": 7403812.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.17815831303596497,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 831.71875,
      "completions/mean_terminated_length": 700.1578979492188,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 0.8549382716049383,
      "grad_norm": 1.6203607169742502,
      "kl": 0.15106201171875,
      "learning_rate": 4.7468146330033874e-07,
      "loss": -0.0896,
      "num_tokens": 7436911.0,
      "reward": 0.0,
      "reward_std": 0.20206104218959808,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 752.5,
      "completions/mean_terminated_length": 662.0,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.8580246913580247,
      "grad_norm": 1.927049495922596,
      "kl": 0.1982421875,
      "learning_rate": 4.7446178715780213e-07,
      "loss": -0.0179,
      "num_tokens": 7467319.0,
      "reward": 0.0,
      "reward_std": 0.199102520942688,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 770.75,
      "completions/mean_terminated_length": 655.6364135742188,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 0.8611111111111112,
      "grad_norm": 2.520690747868014,
      "kl": 0.17913818359375,
      "learning_rate": 4.742412134012047e-07,
      "loss": -0.1682,
      "num_tokens": 7498379.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11317629367113113,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 760.09375,
      "completions/mean_terminated_length": 672.125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.8641975308641975,
      "grad_norm": 1.3364082658906518,
      "kl": 0.1563720703125,
      "learning_rate": 4.740197429126125e-07,
      "loss": -0.1614,
      "num_tokens": 7529446.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.12499846518039703,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 719.59375,
      "completions/mean_terminated_length": 600.478271484375,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.8672839506172839,
      "grad_norm": 1.6900766726674665,
      "kl": 0.1710205078125,
      "learning_rate": 4.7379737657767745e-07,
      "loss": -0.1112,
      "num_tokens": 7558605.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11206148564815521,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 787.21875,
      "completions/mean_terminated_length": 645.1500244140625,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.8703703703703703,
      "grad_norm": 1.5534181757884256,
      "kl": 0.1671142578125,
      "learning_rate": 4.7357411528563393e-07,
      "loss": -0.0699,
      "num_tokens": 7590464.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11586824059486389,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 755.875,
      "completions/mean_terminated_length": 694.0,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.8734567901234568,
      "grad_norm": 1.7407050533198927,
      "kl": 0.190673828125,
      "learning_rate": 4.733499599292955e-07,
      "loss": -0.1111,
      "num_tokens": 7620896.0,
      "reward": 0.02812500298023224,
      "reward_std": 0.13413286209106445,
      "rewards/format_reward_func/mean": 3.3527612686157227e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 782.875,
      "completions/mean_terminated_length": 727.2307739257812,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.8765432098765432,
      "grad_norm": 2.3587940594191217,
      "kl": 0.202392578125,
      "learning_rate": 4.7312491140505064e-07,
      "loss": -0.1237,
      "num_tokens": 7652752.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.23611551523208618,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 765.875,
      "completions/mean_terminated_length": 648.5454711914062,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.8796296296296297,
      "grad_norm": 1.4397587646268084,
      "kl": 0.1761474609375,
      "learning_rate": 4.7289897061285965e-07,
      "loss": -0.0864,
      "num_tokens": 7683580.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.16567710041999817,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 755.8125,
      "completions/mean_terminated_length": 650.8695678710938,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.8827160493827161,
      "grad_norm": 1.3844343604203304,
      "kl": 0.169921875,
      "learning_rate": 4.726721384562513e-07,
      "loss": -0.0201,
      "num_tokens": 7714302.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.13184288144111633,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 804.28125,
      "completions/mean_terminated_length": 731.0416870117188,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.8858024691358025,
      "grad_norm": 1.830878149514015,
      "kl": 0.202392578125,
      "learning_rate": 4.724444158423185e-07,
      "loss": -0.1293,
      "num_tokens": 7746879.0,
      "reward": 0.0,
      "reward_std": 0.17877720296382904,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 861.96875,
      "completions/mean_terminated_length": 719.0,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 0.8888888888888888,
      "grad_norm": 1.1348754400802896,
      "kl": 0.1612548828125,
      "learning_rate": 4.722158036817154e-07,
      "loss": -0.0218,
      "num_tokens": 7780942.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.07559289038181305,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 822.03125,
      "completions/mean_terminated_length": 683.8421020507812,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 0.8919753086419753,
      "grad_norm": 1.6500618392864028,
      "kl": 0.224365234375,
      "learning_rate": 4.7198630288865304e-07,
      "loss": -0.0926,
      "num_tokens": 7814191.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.1277659386396408,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 764.90625,
      "completions/mean_terminated_length": 705.1154174804688,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.8950617283950617,
      "grad_norm": 1.8972891537926457,
      "kl": 0.188232421875,
      "learning_rate": 4.7175591438089646e-07,
      "loss": -0.2144,
      "num_tokens": 7845476.0,
      "reward": 0.0,
      "reward_std": 0.07440169155597687,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 797.625,
      "completions/mean_terminated_length": 679.047607421875,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.8981481481481481,
      "grad_norm": 1.7510276018240747,
      "kl": 0.20361328125,
      "learning_rate": 4.7152463907976024e-07,
      "loss": -0.0901,
      "num_tokens": 7878244.0,
      "reward": -3.259629011154175e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 750.5,
      "completions/mean_terminated_length": 643.478271484375,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.9012345679012346,
      "grad_norm": 1.9643840722039991,
      "kl": 0.2164306640625,
      "learning_rate": 4.7129247791010563e-07,
      "loss": -0.1134,
      "num_tokens": 7908864.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14452563226222992,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 786.625,
      "completions/mean_terminated_length": 707.5,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.904320987654321,
      "grad_norm": 1.7647614881243303,
      "kl": 0.187744140625,
      "learning_rate": 4.710594318003361e-07,
      "loss": -0.179,
      "num_tokens": 7940780.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1901835799217224,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 810.125,
      "completions/mean_terminated_length": 738.8333740234375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.9074074074074074,
      "grad_norm": 1.7120283526762177,
      "kl": 0.187744140625,
      "learning_rate": 4.7082550168239423e-07,
      "loss": -0.1556,
      "num_tokens": 7972860.0,
      "reward": 0.02812500298023224,
      "reward_std": 0.12490952014923096,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 673.75,
      "completions/mean_terminated_length": 623.7142944335938,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 0.9104938271604939,
      "grad_norm": 2.6500365855705112,
      "kl": 0.1890869140625,
      "learning_rate": 4.705906884917573e-07,
      "loss": -0.2749,
      "num_tokens": 8000448.0,
      "reward": 0.02812500111758709,
      "reward_std": 0.12204517424106598,
      "rewards/format_reward_func/mean": 3.3527612686157227e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 792.46875,
      "completions/mean_terminated_length": 715.2916870117188,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.9135802469135802,
      "grad_norm": 2.1181226483949627,
      "kl": 0.2181396484375,
      "learning_rate": 4.703549931674345e-07,
      "loss": -0.0669,
      "num_tokens": 8032479.0,
      "reward": 0.0,
      "reward_std": 0.1861894428730011,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 683.3125,
      "completions/mean_terminated_length": 634.6428833007812,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "epoch": 0.9166666666666666,
      "grad_norm": 1.6382130671490476,
      "kl": 0.1922607421875,
      "learning_rate": 4.7011841665196227e-07,
      "loss": -0.1543,
      "num_tokens": 8060317.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.2640279233455658,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 799.5,
      "completions/mean_terminated_length": 736.6399536132812,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 0.9197530864197531,
      "grad_norm": 1.7764343185319038,
      "kl": 0.1827392578125,
      "learning_rate": 4.6988095989140096e-07,
      "loss": -0.1012,
      "num_tokens": 8092253.0,
      "reward": 6.51925802230835e-09,
      "reward_std": 0.15763449668884277,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 750.625,
      "completions/mean_terminated_length": 687.5385131835938,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.9228395061728395,
      "grad_norm": 1.7056692070578798,
      "kl": 0.2401123046875,
      "learning_rate": 4.6964262383533114e-07,
      "loss": -0.0217,
      "num_tokens": 8122829.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.22097495198249817,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 764.4375,
      "completions/mean_terminated_length": 704.5385131835938,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.9259259259259259,
      "grad_norm": 1.065816113120465,
      "kl": 0.1832275390625,
      "learning_rate": 4.694034094368495e-07,
      "loss": -0.0082,
      "num_tokens": 8153715.0,
      "reward": 0.02812500111758709,
      "reward_std": 0.11321557313203812,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 806.0,
      "completions/mean_terminated_length": 706.9091186523438,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 0.9290123456790124,
      "grad_norm": 2.2304382121233277,
      "kl": 0.2069091796875,
      "learning_rate": 4.691633176525651e-07,
      "loss": -0.0853,
      "num_tokens": 8186447.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.27420520782470703,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 841.59375,
      "completions/mean_terminated_length": 790.5199584960938,
      "completions/min_length": 615.0,
      "completions/min_terminated_length": 615.0,
      "epoch": 0.9320987654320988,
      "grad_norm": 1.7871214196940068,
      "kl": 0.189208984375,
      "learning_rate": 4.689223494425959e-07,
      "loss": -0.2181,
      "num_tokens": 8219918.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.27329564094543457,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 694.5625,
      "completions/mean_terminated_length": 647.5,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.9351851851851852,
      "grad_norm": 1.26431278276871,
      "kl": 0.2164306640625,
      "learning_rate": 4.686805057705645e-07,
      "loss": -0.0975,
      "num_tokens": 8248556.0,
      "reward": 0.028124993667006493,
      "reward_std": 0.12057674676179886,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 802.3125,
      "completions/mean_terminated_length": 715.5652465820312,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.9382716049382716,
      "grad_norm": 1.6362305459893027,
      "kl": 0.189697265625,
      "learning_rate": 4.684377876035944e-07,
      "loss": -0.1168,
      "num_tokens": 8280638.0,
      "reward": 0.0,
      "reward_std": 0.08027060329914093,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 806.15625,
      "completions/mean_terminated_length": 692.047607421875,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.941358024691358,
      "grad_norm": 1.361930129716925,
      "kl": 0.18896484375,
      "learning_rate": 4.681941959123063e-07,
      "loss": 0.0299,
      "num_tokens": 8312699.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.10160572826862335,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 808.15625,
      "completions/mean_terminated_length": 747.719970703125,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 0.9444444444444444,
      "grad_norm": 1.4194116880842143,
      "kl": 0.1986083984375,
      "learning_rate": 4.6794973167081397e-07,
      "loss": -0.1375,
      "num_tokens": 8345244.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18914230167865753,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 767.5625,
      "completions/mean_terminated_length": 730.9285888671875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.9475308641975309,
      "grad_norm": 1.3125333171183138,
      "kl": 0.208251953125,
      "learning_rate": 4.6770439585672046e-07,
      "loss": -0.0046,
      "num_tokens": 8375882.0,
      "reward": 0.028124993667006493,
      "reward_std": 0.10480768978595734,
      "rewards/format_reward_func/mean": -2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 769.25,
      "completions/mean_terminated_length": 697.9199829101562,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 0.9506172839506173,
      "grad_norm": 1.2496521635499362,
      "kl": 0.2099609375,
      "learning_rate": 4.6745818945111426e-07,
      "loss": -0.0655,
      "num_tokens": 8407654.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977146148682,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 681.375,
      "completions/mean_terminated_length": 632.4285888671875,
      "completions/min_length": 267.0,
      "completions/min_terminated_length": 267.0,
      "epoch": 0.9537037037037037,
      "grad_norm": 3.192894771680404,
      "kl": 0.2269287109375,
      "learning_rate": 4.6721111343856547e-07,
      "loss": -0.2669,
      "num_tokens": 8435982.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.13851019740104675,
      "rewards/format_reward_func/mean": 2.60770320892334e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 799.65625,
      "completions/mean_terminated_length": 697.6818237304688,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.9567901234567902,
      "grad_norm": 1.7182349275263777,
      "kl": 0.2216796875,
      "learning_rate": 4.669631688071214e-07,
      "loss": -0.1529,
      "num_tokens": 8468247.0,
      "reward": 0.0,
      "reward_std": 0.09106835722923279,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 780.9375,
      "completions/mean_terminated_length": 712.8800048828125,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 0.9598765432098766,
      "grad_norm": 2.0344590490945773,
      "kl": 0.2119140625,
      "learning_rate": 4.667143565483032e-07,
      "loss": -0.1942,
      "num_tokens": 8499581.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.10064592957496643,
      "rewards/format_reward_func/mean": -2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 850.5625,
      "completions/mean_terminated_length": 771.727294921875,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 0.9629629629629629,
      "grad_norm": 1.387639138328901,
      "kl": 0.205810546875,
      "learning_rate": 4.664646776571015e-07,
      "loss": -0.0767,
      "num_tokens": 8532843.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11077355593442917,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 778.53125,
      "completions/mean_terminated_length": 682.478271484375,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 0.9660493827160493,
      "grad_norm": 1.453385688103117,
      "kl": 0.1954345703125,
      "learning_rate": 4.662141331319726e-07,
      "loss": -0.0833,
      "num_tokens": 8563888.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.18955133855342865,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 805.28125,
      "completions/mean_terminated_length": 744.0399780273438,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.9691358024691358,
      "grad_norm": 1.5846198018953086,
      "kl": 0.237060546875,
      "learning_rate": 4.6596272397483445e-07,
      "loss": -0.0878,
      "num_tokens": 8596345.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.14731834828853607,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 802.65625,
      "completions/mean_terminated_length": 728.875,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 0.9722222222222222,
      "grad_norm": 1.4165274726711845,
      "kl": 0.18896484375,
      "learning_rate": 4.657104511910626e-07,
      "loss": -0.1436,
      "num_tokens": 8628278.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.15477585792541504,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 739.9375,
      "completions/mean_terminated_length": 674.3846435546875,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.9753086419753086,
      "grad_norm": 1.4267112007514666,
      "kl": 0.207275390625,
      "learning_rate": 4.654573157894861e-07,
      "loss": -0.1971,
      "num_tokens": 8658600.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.07367793470621109,
      "rewards/format_reward_func/mean": -2.60770320892334e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 835.5625,
      "completions/mean_terminated_length": 761.8261108398438,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 0.9783950617283951,
      "grad_norm": 1.2429530468159484,
      "kl": 0.207275390625,
      "learning_rate": 4.652033187823838e-07,
      "loss": 0.0084,
      "num_tokens": 8692498.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.1097278892993927,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 810.1875,
      "completions/mean_terminated_length": 713.0,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 0.9814814814814815,
      "grad_norm": 1.3244109968980724,
      "kl": 0.1849365234375,
      "learning_rate": 4.6494846118548e-07,
      "loss": -0.0008,
      "num_tokens": 8725076.0,
      "reward": 0.0,
      "reward_std": 0.18432721495628357,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 847.21875,
      "completions/mean_terminated_length": 788.2916870117188,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.9845679012345679,
      "grad_norm": 1.214473210632424,
      "kl": 0.2095947265625,
      "learning_rate": 4.6469274401794044e-07,
      "loss": -0.0535,
      "num_tokens": 8758731.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.11348775029182434,
      "rewards/format_reward_func/mean": -2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 828.25,
      "completions/mean_terminated_length": 751.6521606445312,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 0.9876543209876543,
      "grad_norm": 1.8287834018417068,
      "kl": 0.2266845703125,
      "learning_rate": 4.6443616830236823e-07,
      "loss": -0.0622,
      "num_tokens": 8791855.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.16360794007778168,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 825.6875,
      "completions/mean_terminated_length": 759.5833740234375,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 0.9907407407407407,
      "grad_norm": 1.2917476118963442,
      "kl": 0.2122802734375,
      "learning_rate": 4.641787350647997e-07,
      "loss": -0.018,
      "num_tokens": 8824525.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.12261858582496643,
      "rewards/format_reward_func/mean": -2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 720.71875,
      "completions/mean_terminated_length": 677.3928833007812,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.9938271604938271,
      "grad_norm": 2.315236983754253,
      "kl": 0.2188720703125,
      "learning_rate": 4.6392044533470053e-07,
      "loss": -0.0938,
      "num_tokens": 8854212.0,
      "reward": 0.0,
      "reward_std": 0.18609295785427094,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 793.46875,
      "completions/mean_terminated_length": 760.5357666015625,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 0.9969135802469136,
      "grad_norm": 1.4607945026342088,
      "kl": 0.2222900390625,
      "learning_rate": 4.636613001449615e-07,
      "loss": 0.0201,
      "num_tokens": 8885343.0,
      "reward": 0.02812500298023224,
      "reward_std": 0.12921050190925598,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 783.78125,
      "completions/mean_terminated_length": 739.2963256835938,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 1.0,
      "grad_norm": 1.2742956031622807,
      "kl": 0.1968994140625,
      "learning_rate": 4.6340130053189417e-07,
      "loss": -0.0566,
      "num_tokens": 8916640.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09852586686611176,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 766.53125,
      "completions/mean_terminated_length": 718.8518676757812,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 1.0030864197530864,
      "grad_norm": 1.3386985658527497,
      "kl": 0.2254638671875,
      "learning_rate": 4.6314044753522703e-07,
      "loss": -0.0188,
      "num_tokens": 8947637.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 823.65625,
      "completions/mean_terminated_length": 745.2608642578125,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 1.0061728395061729,
      "grad_norm": 1.045012885803152,
      "kl": 0.225341796875,
      "learning_rate": 4.6287874219810117e-07,
      "loss": -0.0212,
      "num_tokens": 8980434.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.05696558207273483,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 760.21875,
      "completions/mean_terminated_length": 672.2916870117188,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 1.0092592592592593,
      "grad_norm": 2.070829051629741,
      "kl": 0.200439453125,
      "learning_rate": 4.626161855670663e-07,
      "loss": -0.1193,
      "num_tokens": 9011305.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.18401594460010529,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 794.75,
      "completions/mean_terminated_length": 705.0435180664062,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 1.0123456790123457,
      "grad_norm": 1.391068659844964,
      "kl": 0.260986328125,
      "learning_rate": 4.623527786920761e-07,
      "loss": -0.0676,
      "num_tokens": 9043449.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.06405126303434372,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 751.03125,
      "completions/mean_terminated_length": 700.4815063476562,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 1.0154320987654322,
      "grad_norm": 1.9555365607649131,
      "kl": 0.24267578125,
      "learning_rate": 4.620885226264847e-07,
      "loss": -0.1742,
      "num_tokens": 9074510.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.07151594012975693,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 757.53125,
      "completions/mean_terminated_length": 708.1851806640625,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 1.0185185185185186,
      "grad_norm": 1.2916364207005597,
      "kl": 0.225341796875,
      "learning_rate": 4.6182341842704177e-07,
      "loss": -0.0527,
      "num_tokens": 9105459.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11321558058261871,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 800.5,
      "completions/mean_terminated_length": 713.0435180664062,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 1.021604938271605,
      "grad_norm": 1.8860469273746234,
      "kl": 0.24658203125,
      "learning_rate": 4.6155746715388903e-07,
      "loss": -0.1312,
      "num_tokens": 9137911.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.09607689082622528,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 797.59375,
      "completions/mean_terminated_length": 694.6818237304688,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 1.0246913580246915,
      "grad_norm": 0.8196547752979229,
      "kl": 0.232421875,
      "learning_rate": 4.6129066987055533e-07,
      "loss": 0.0111,
      "num_tokens": 9170582.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.07786067575216293,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 800.0,
      "completions/mean_terminated_length": 698.1818237304688,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 1.0277777777777777,
      "grad_norm": 1.6017538171568595,
      "kl": 0.2315673828125,
      "learning_rate": 4.610230276439526e-07,
      "loss": -0.134,
      "num_tokens": 9202658.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.06900564581155777,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 753.15625,
      "completions/mean_terminated_length": 714.4642944335938,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 1.0308641975308641,
      "grad_norm": 1.3638967708903083,
      "kl": 0.2313232421875,
      "learning_rate": 4.607545415443721e-07,
      "loss": -0.1003,
      "num_tokens": 9232935.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.12381581962108612,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 791.9375,
      "completions/mean_terminated_length": 738.3846435546875,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 1.0339506172839505,
      "grad_norm": 1.3389285558071031,
      "kl": 0.2313232421875,
      "learning_rate": 4.604852126454792e-07,
      "loss": 0.0027,
      "num_tokens": 9264865.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.13184289634227753,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 782.6875,
      "completions/mean_terminated_length": 738.0,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 1.037037037037037,
      "grad_norm": 1.0859184709201632,
      "kl": 0.2322998046875,
      "learning_rate": 4.6021504202430983e-07,
      "loss": -0.0282,
      "num_tokens": 9296891.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.059618230909109116,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 761.0,
      "completions/mean_terminated_length": 673.3333740234375,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 1.0401234567901234,
      "grad_norm": 1.5540858164555094,
      "kl": 0.2288818359375,
      "learning_rate": 4.599440307612661e-07,
      "loss": 0.0036,
      "num_tokens": 9327651.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.1387912631034851,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 826.90625,
      "completions/mean_terminated_length": 771.719970703125,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 1.0432098765432098,
      "grad_norm": 1.0394281470351463,
      "kl": 0.2412109375,
      "learning_rate": 4.5967217994011144e-07,
      "loss": 0.0169,
      "num_tokens": 9360664.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 779.09375,
      "completions/mean_terminated_length": 683.2608642578125,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 1.0462962962962963,
      "grad_norm": 1.0995136848421587,
      "kl": 0.221923828125,
      "learning_rate": 4.593994906479669e-07,
      "loss": -0.0187,
      "num_tokens": 9392319.0,
      "reward": 0.0,
      "reward_std": 0.14918872714042664,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 806.75,
      "completions/mean_terminated_length": 692.952392578125,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 1.0493827160493827,
      "grad_norm": 1.5452541460119205,
      "kl": 0.2412109375,
      "learning_rate": 4.591259639753066e-07,
      "loss": -0.1518,
      "num_tokens": 9424551.0,
      "reward": 0.0,
      "reward_std": 0.18690168857574463,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 755.8125,
      "completions/mean_terminated_length": 706.1481323242188,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 1.0524691358024691,
      "grad_norm": 1.6743990686258594,
      "kl": 0.239990234375,
      "learning_rate": 4.588516010159529e-07,
      "loss": -0.1598,
      "num_tokens": 9455101.0,
      "reward": 0.0,
      "reward_std": 0.1832430213689804,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 807.78125,
      "completions/mean_terminated_length": 723.1739501953125,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 1.0555555555555556,
      "grad_norm": 1.421654947289587,
      "kl": 0.245361328125,
      "learning_rate": 4.58576402867073e-07,
      "loss": -0.0177,
      "num_tokens": 9487994.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.09607689082622528,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 775.84375,
      "completions/mean_terminated_length": 729.888916015625,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 1.058641975308642,
      "grad_norm": 1.851952885497318,
      "kl": 0.2291259765625,
      "learning_rate": 4.5830037062917373e-07,
      "loss": -0.0178,
      "num_tokens": 9519473.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.18941769003868103,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 806.8125,
      "completions/mean_terminated_length": 721.8261108398438,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 1.0617283950617284,
      "grad_norm": 0.9903122061679567,
      "kl": 0.22900390625,
      "learning_rate": 4.580235054060971e-07,
      "loss": -0.0547,
      "num_tokens": 9552079.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.056965578347444534,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 742.1875,
      "completions/mean_terminated_length": 677.1538696289062,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 1.0648148148148149,
      "grad_norm": 1.017191215633938,
      "kl": 0.2313232421875,
      "learning_rate": 4.5774580830501685e-07,
      "loss": -0.0368,
      "num_tokens": 9582397.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 807.78125,
      "completions/mean_terminated_length": 639.6111450195312,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 1.0679012345679013,
      "grad_norm": 1.271190440324145,
      "kl": 0.3494873046875,
      "learning_rate": 4.574672804364329e-07,
      "loss": 0.0452,
      "num_tokens": 9614622.0,
      "reward": 0.0,
      "reward_std": 0.14905758202075958,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 799.46875,
      "completions/mean_terminated_length": 724.625,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 1.0709876543209877,
      "grad_norm": 1.2963272797093504,
      "kl": 0.246826171875,
      "learning_rate": 4.571879229141674e-07,
      "loss": -0.0761,
      "num_tokens": 9646701.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.15223580598831177,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 729.34375,
      "completions/mean_terminated_length": 698.862060546875,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 1.074074074074074,
      "grad_norm": 1.8582353993105871,
      "kl": 0.2420654296875,
      "learning_rate": 4.5690773685536037e-07,
      "loss": -0.0516,
      "num_tokens": 9676248.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.18624797463417053,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 736.75,
      "completions/mean_terminated_length": 670.4615478515625,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 1.0771604938271604,
      "grad_norm": 1.3292785642013,
      "kl": 0.2664794921875,
      "learning_rate": 4.5662672338046513e-07,
      "loss": -0.0511,
      "num_tokens": 9706268.0,
      "reward": -4.190951585769653e-09,
      "reward_std": 0.06405126303434372,
      "rewards/format_reward_func/mean": -3.725290298461914e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 769.125,
      "completions/mean_terminated_length": 684.1666870117188,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 1.0802469135802468,
      "grad_norm": 1.4656789656259415,
      "kl": 0.263916015625,
      "learning_rate": 4.5634488361324386e-07,
      "loss": -0.057,
      "num_tokens": 9737428.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.06749087572097778,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 682.5,
      "completions/mean_terminated_length": 633.7142944335938,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 1.0833333333333333,
      "grad_norm": 1.6015260587938807,
      "kl": 0.2655029296875,
      "learning_rate": 4.560622186807628e-07,
      "loss": -0.1084,
      "num_tokens": 9765428.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.08606629073619843,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 730.4375,
      "completions/mean_terminated_length": 632.5833740234375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 1.0864197530864197,
      "grad_norm": 1.545253708649919,
      "kl": 0.247802734375,
      "learning_rate": 4.5577872971338826e-07,
      "loss": -0.0316,
      "num_tokens": 9795078.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.1365205943584442,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 820.65625,
      "completions/mean_terminated_length": 714.1428833007812,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 1.0895061728395061,
      "grad_norm": 0.8123871552169197,
      "kl": 0.246337890625,
      "learning_rate": 4.554944178447816e-07,
      "loss": -0.0042,
      "num_tokens": 9827747.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 750.9375,
      "completions/mean_terminated_length": 711.9285888671875,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 1.0925925925925926,
      "grad_norm": 1.3114040538237064,
      "kl": 0.2572021484375,
      "learning_rate": 4.552092842118952e-07,
      "loss": 0.0155,
      "num_tokens": 9858769.0,
      "reward": 0.0,
      "reward_std": 0.07440169155597687,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 758.8125,
      "completions/mean_terminated_length": 638.2727661132812,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 1.095679012345679,
      "grad_norm": 1.9687993176615746,
      "kl": 1.0830078125,
      "learning_rate": 4.549233299549674e-07,
      "loss": -0.0936,
      "num_tokens": 9889595.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.21730080246925354,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 741.125,
      "completions/mean_terminated_length": 675.84619140625,
      "completions/min_length": 301.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 1.0987654320987654,
      "grad_norm": 1.4586965770015488,
      "kl": 0.2420654296875,
      "learning_rate": 4.546365562175184e-07,
      "loss": -0.0696,
      "num_tokens": 9919671.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.08254127204418182,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 779.53125,
      "completions/mean_terminated_length": 744.607177734375,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 1.1018518518518519,
      "grad_norm": 1.1097968945572727,
      "kl": 0.264892578125,
      "learning_rate": 4.543489641463452e-07,
      "loss": -0.0138,
      "num_tokens": 9951212.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.059618234634399414,
      "rewards/format_reward_func/mean": -2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 708.125,
      "completions/mean_terminated_length": 675.4483032226562,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 1.1049382716049383,
      "grad_norm": 1.4865111899797392,
      "kl": 0.257568359375,
      "learning_rate": 4.540605548915175e-07,
      "loss": -0.0236,
      "num_tokens": 9980156.0,
      "reward": 0.028124993667006493,
      "reward_std": 0.1413259506225586,
      "rewards/format_reward_func/mean": -2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 772.03125,
      "completions/mean_terminated_length": 736.0357666015625,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 1.1080246913580247,
      "grad_norm": 1.5675281959330434,
      "kl": 0.25,
      "learning_rate": 4.537713296063729e-07,
      "loss": -0.1305,
      "num_tokens": 10011309.0,
      "reward": 4.6566128730773926e-09,
      "reward_std": 0.08913275599479675,
      "rewards/format_reward_func/mean": 3.725290298461914e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 701.84375,
      "completions/mean_terminated_length": 642.1851806640625,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 1.1111111111111112,
      "grad_norm": 1.516405493949358,
      "kl": 0.257080078125,
      "learning_rate": 4.534812894475122e-07,
      "loss": -0.0769,
      "num_tokens": 10040164.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.18401594460010529,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 736.40625,
      "completions/mean_terminated_length": 683.1481323242188,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 1.1141975308641976,
      "grad_norm": 1.241367741678557,
      "kl": 0.259033203125,
      "learning_rate": 4.5319043557479474e-07,
      "loss": -0.0822,
      "num_tokens": 10070633.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.094046451151371,
      "rewards/format_reward_func/mean": -2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 826.4375,
      "completions/mean_terminated_length": 722.952392578125,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 1.117283950617284,
      "grad_norm": 1.113618964831504,
      "kl": 0.2376708984375,
      "learning_rate": 4.5289876915133394e-07,
      "loss": -0.0292,
      "num_tokens": 10103715.0,
      "reward": 0.05624999478459358,
      "reward_std": 0.11115353554487228,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0625,
      "rewards/logprob_reward/std": 0.24593468010425568,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 718.375,
      "completions/mean_terminated_length": 674.7142944335938,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 1.1203703703703705,
      "grad_norm": 1.8652216196467917,
      "kl": 0.2601318359375,
      "learning_rate": 4.5260629134349284e-07,
      "loss": -0.1558,
      "num_tokens": 10132667.0,
      "reward": 0.0,
      "reward_std": 0.1673842817544937,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 670.84375,
      "completions/mean_terminated_length": 634.3103637695312,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 1.123456790123457,
      "grad_norm": 1.8852861657192153,
      "kl": 0.2645263671875,
      "learning_rate": 4.523130033208788e-07,
      "loss": -0.0096,
      "num_tokens": 10160562.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.21927371621131897,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 774.96875,
      "completions/mean_terminated_length": 644.5238037109375,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 1.126543209876543,
      "grad_norm": 1.5917623773340683,
      "kl": 0.2445068359375,
      "learning_rate": 4.520189062563393e-07,
      "loss": 0.0687,
      "num_tokens": 10192437.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.188092902302742,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 707.8125,
      "completions/mean_terminated_length": 649.25927734375,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 1.1296296296296295,
      "grad_norm": 2.12568058540548,
      "kl": 0.2535400390625,
      "learning_rate": 4.5172400132595737e-07,
      "loss": -0.111,
      "num_tokens": 10221219.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.1345728486776352,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 681.4375,
      "completions/mean_terminated_length": 618.0,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 1.132716049382716,
      "grad_norm": 1.4880863233115877,
      "kl": 0.2850341796875,
      "learning_rate": 4.514282897090464e-07,
      "loss": -0.0282,
      "num_tokens": 10249201.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.13184288144111633,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 776.5625,
      "completions/mean_terminated_length": 646.952392578125,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 1.1358024691358024,
      "grad_norm": 0.016900480733155743,
      "kl": 0.2630615234375,
      "learning_rate": 4.511317725881457e-07,
      "loss": 0.0003,
      "num_tokens": 10281067.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 702.125,
      "completions/mean_terminated_length": 642.5184936523438,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "epoch": 1.1388888888888888,
      "grad_norm": 1.3593715604492063,
      "kl": 0.2454833984375,
      "learning_rate": 4.50834451149016e-07,
      "loss": -0.0019,
      "num_tokens": 10309879.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.11906316876411438,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 682.5625,
      "completions/mean_terminated_length": 647.2413940429688,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 1.1419753086419753,
      "grad_norm": 1.354772613119782,
      "kl": 0.3011474609375,
      "learning_rate": 4.505363265806342e-07,
      "loss": -0.0109,
      "num_tokens": 10338141.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.07151594012975693,
      "rewards/format_reward_func/mean": 2.9802322387695312e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 707.4375,
      "completions/mean_terminated_length": 634.3846435546875,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 1.1450617283950617,
      "grad_norm": 1.4972720180071932,
      "kl": 0.2691650390625,
      "learning_rate": 4.502374000751891e-07,
      "loss": -0.0622,
      "num_tokens": 10367407.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.17211824655532837,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 753.1875,
      "completions/mean_terminated_length": 647.2174072265625,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 1.1481481481481481,
      "grad_norm": 1.8175021168347008,
      "kl": 0.275634765625,
      "learning_rate": 4.49937672828076e-07,
      "loss": -0.0613,
      "num_tokens": 10397509.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.23933574557304382,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 709.5,
      "completions/mean_terminated_length": 651.25927734375,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 1.1512345679012346,
      "grad_norm": 1.3041032505629424,
      "kl": 0.2508544921875,
      "learning_rate": 4.4963714603789315e-07,
      "loss": -0.0946,
      "num_tokens": 10426677.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.16686978936195374,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 675.625,
      "completions/mean_terminated_length": 664.3870849609375,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 1.154320987654321,
      "grad_norm": 1.034446464436393,
      "kl": 0.2545166015625,
      "learning_rate": 4.4933582090643516e-07,
      "loss": -0.015,
      "num_tokens": 10454741.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.07786068320274353,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 758.3125,
      "completions/mean_terminated_length": 697.0,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 1.1574074074074074,
      "grad_norm": 1.3057103198966025,
      "kl": 0.263671875,
      "learning_rate": 4.4903369863869e-07,
      "loss": -0.0673,
      "num_tokens": 10485355.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.08985722810029984,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 719.1875,
      "completions/mean_terminated_length": 687.6551513671875,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 1.1604938271604939,
      "grad_norm": 1.2734489858967948,
      "kl": 0.29296875,
      "learning_rate": 4.4873078044283273e-07,
      "loss": -0.0296,
      "num_tokens": 10514849.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.06900564581155777,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 766.34375,
      "completions/mean_terminated_length": 729.5357666015625,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 1.1635802469135803,
      "grad_norm": 1.6566510683607516,
      "kl": 0.2490234375,
      "learning_rate": 4.484270675302218e-07,
      "loss": 0.0077,
      "num_tokens": 10546096.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.14731836318969727,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 767.09375,
      "completions/mean_terminated_length": 695.1599731445312,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 1.1666666666666667,
      "grad_norm": 1.3566675053102628,
      "kl": 0.292236328125,
      "learning_rate": 4.481225611153933e-07,
      "loss": -0.0718,
      "num_tokens": 10577267.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.06749087572097778,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 872.78125,
      "completions/mean_terminated_length": 813.6087036132812,
      "completions/min_length": 569.0,
      "completions/min_terminated_length": 569.0,
      "epoch": 1.1697530864197532,
      "grad_norm": 1.2471242642242912,
      "kl": 0.2442626953125,
      "learning_rate": 4.4781726241605683e-07,
      "loss": 0.0262,
      "num_tokens": 10612108.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.08027059584856033,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 765.84375,
      "completions/mean_terminated_length": 679.7916870117188,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 1.1728395061728394,
      "grad_norm": 1.30494481174423,
      "kl": 0.3017578125,
      "learning_rate": 4.4751117265309e-07,
      "loss": 0.0063,
      "num_tokens": 10642895.0,
      "reward": 0.0,
      "reward_std": 0.08027060329914093,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 765.59375,
      "completions/mean_terminated_length": 648.1363525390625,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 1.175925925925926,
      "grad_norm": 0.9044069185541123,
      "kl": 0.275390625,
      "learning_rate": 4.472042930505342e-07,
      "loss": -0.0351,
      "num_tokens": 10674298.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 734.90625,
      "completions/mean_terminated_length": 668.1923217773438,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 1.1790123456790123,
      "grad_norm": 1.3501100187377286,
      "kl": 0.2606201171875,
      "learning_rate": 4.46896624835589e-07,
      "loss": -0.0829,
      "num_tokens": 10704623.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.15223580598831177,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 799.0,
      "completions/mean_terminated_length": 724.0,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 1.1820987654320987,
      "grad_norm": 1.2447347049204807,
      "kl": 0.262939453125,
      "learning_rate": 4.465881692386078e-07,
      "loss": 0.0098,
      "num_tokens": 10736651.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11206148564815521,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 683.75,
      "completions/mean_terminated_length": 620.74072265625,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 1.1851851851851851,
      "grad_norm": 1.5798290251530331,
      "kl": 0.2781982421875,
      "learning_rate": 4.4627892749309273e-07,
      "loss": -0.0459,
      "num_tokens": 10764883.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.1861894279718399,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 746.28125,
      "completions/mean_terminated_length": 694.8518676757812,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 1.1882716049382716,
      "grad_norm": 1.2819675135830302,
      "kl": 0.2896728515625,
      "learning_rate": 4.459689008356896e-07,
      "loss": -0.0129,
      "num_tokens": 10794992.0,
      "reward": 0.0,
      "reward_std": 0.07886750996112823,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 759.125,
      "completions/mean_terminated_length": 698.0,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 1.191358024691358,
      "grad_norm": 1.2606431620360652,
      "kl": 0.2867431640625,
      "learning_rate": 4.4565809050618317e-07,
      "loss": -0.0015,
      "num_tokens": 10825748.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.12525564432144165,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 799.0625,
      "completions/mean_terminated_length": 664.1000366210938,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 1.1944444444444444,
      "grad_norm": 0.9421573699780736,
      "kl": 0.2657470703125,
      "learning_rate": 4.45346497747492e-07,
      "loss": -0.0066,
      "num_tokens": 10857862.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 755.75,
      "completions/mean_terminated_length": 680.6400146484375,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 1.1975308641975309,
      "grad_norm": 0.9977061885065656,
      "kl": 0.36279296875,
      "learning_rate": 4.450341238056634e-07,
      "loss": -0.0194,
      "num_tokens": 10888590.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 763.84375,
      "completions/mean_terminated_length": 677.125,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 1.2006172839506173,
      "grad_norm": 1.3470845527511202,
      "kl": 0.27880859375,
      "learning_rate": 4.4472096992986895e-07,
      "loss": 0.0067,
      "num_tokens": 10919857.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.09305032342672348,
      "rewards/format_reward_func/mean": 1.1175870895385742e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 673.78125,
      "completions/mean_terminated_length": 637.5516967773438,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 1.2037037037037037,
      "grad_norm": 1.3016995988610571,
      "kl": 0.302490234375,
      "learning_rate": 4.444070373723989e-07,
      "loss": -0.0277,
      "num_tokens": 10947570.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.13184288144111633,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 775.5625,
      "completions/mean_terminated_length": 692.75,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 1.2067901234567902,
      "grad_norm": 1.3185866373896116,
      "kl": 0.274658203125,
      "learning_rate": 4.4409232738865744e-07,
      "loss": 0.0107,
      "num_tokens": 10979024.0,
      "reward": 0.028124993667006493,
      "reward_std": 0.12057674676179886,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 755.28125,
      "completions/mean_terminated_length": 665.7083740234375,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 1.2098765432098766,
      "grad_norm": 1.1114605142361125,
      "kl": 0.3101806640625,
      "learning_rate": 4.4377684123715763e-07,
      "loss": -0.0029,
      "num_tokens": 11009681.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.06432675570249557,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 757.40625,
      "completions/mean_terminated_length": 682.760009765625,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 1.212962962962963,
      "grad_norm": 1.2370891083488853,
      "kl": 0.2557373046875,
      "learning_rate": 4.434605801795167e-07,
      "loss": -0.0249,
      "num_tokens": 11040470.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 742.78125,
      "completions/mean_terminated_length": 632.7391357421875,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 1.2160493827160495,
      "grad_norm": 0.9488392827947685,
      "kl": 0.296630859375,
      "learning_rate": 4.431435454804503e-07,
      "loss": 0.0133,
      "num_tokens": 11070483.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 735.75,
      "completions/mean_terminated_length": 682.370361328125,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 1.2191358024691359,
      "grad_norm": 1.3788628642821232,
      "kl": 0.288330078125,
      "learning_rate": 4.42825738407768e-07,
      "loss": -0.0402,
      "num_tokens": 11101003.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19551047682762146,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 782.8125,
      "completions/mean_terminated_length": 702.4166870117188,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 1.2222222222222223,
      "grad_norm": 0.8794678162422529,
      "kl": 0.29345703125,
      "learning_rate": 4.425071602323681e-07,
      "loss": 0.0209,
      "num_tokens": 11132509.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.06432675570249557,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 793.25,
      "completions/mean_terminated_length": 672.3809814453125,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 1.2253086419753085,
      "grad_norm": 1.113998526736297,
      "kl": 0.305908203125,
      "learning_rate": 4.421878122282325e-07,
      "loss": 0.0005,
      "num_tokens": 11164157.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.07151593267917633,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 760.09375,
      "completions/mean_terminated_length": 672.125,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 1.228395061728395,
      "grad_norm": 0.7409330586619888,
      "kl": 0.2860107421875,
      "learning_rate": 4.4186769567242163e-07,
      "loss": 0.0028,
      "num_tokens": 11195028.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 710.21875,
      "completions/mean_terminated_length": 665.3928833007812,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 1.2314814814814814,
      "grad_norm": 1.034515189164874,
      "kl": 0.29443359375,
      "learning_rate": 4.4154681184506927e-07,
      "loss": 0.024,
      "num_tokens": 11224227.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 774.53125,
      "completions/mean_terminated_length": 661.1364135742188,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.2345679012345678,
      "grad_norm": 1.0577007948656316,
      "kl": 0.327392578125,
      "learning_rate": 4.4122516202937745e-07,
      "loss": 0.0348,
      "num_tokens": 11255848.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 764.71875,
      "completions/mean_terminated_length": 727.6785888671875,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 1.2376543209876543,
      "grad_norm": 0.7348152848815923,
      "kl": 0.285888671875,
      "learning_rate": 4.4090274751161144e-07,
      "loss": 0.025,
      "num_tokens": 11287351.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 794.625,
      "completions/mean_terminated_length": 752.1481323242188,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 1.2407407407407407,
      "grad_norm": 1.1781215750943606,
      "kl": 0.31640625,
      "learning_rate": 4.4057956958109453e-07,
      "loss": 0.0225,
      "num_tokens": 11319351.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10609643161296844,
      "rewards/format_reward_func/mean": -7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 789.0,
      "completions/mean_terminated_length": 697.0435180664062,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 1.2438271604938271,
      "grad_norm": 0.014007581081706421,
      "kl": 0.288330078125,
      "learning_rate": 4.402556295302029e-07,
      "loss": 0.0003,
      "num_tokens": 11351103.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 751.21875,
      "completions/mean_terminated_length": 688.2692260742188,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 1.2469135802469136,
      "grad_norm": 1.232008619438482,
      "kl": 0.328125,
      "learning_rate": 4.3993092865436035e-07,
      "loss": -0.03,
      "num_tokens": 11381482.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.07151593267917633,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 752.375,
      "completions/mean_terminated_length": 661.8333740234375,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 1.25,
      "grad_norm": 0.47822770796013664,
      "kl": 0.327392578125,
      "learning_rate": 4.3960546825203304e-07,
      "loss": 0.0151,
      "num_tokens": 11411846.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 773.09375,
      "completions/mean_terminated_length": 659.0454711914062,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 1.2530864197530864,
      "grad_norm": 0.8680461203097858,
      "kl": 0.306884765625,
      "learning_rate": 4.392792496247248e-07,
      "loss": 0.0099,
      "num_tokens": 11442933.0,
      "reward": 0.0,
      "reward_std": 0.13912895321846008,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 714.53125,
      "completions/mean_terminated_length": 643.1154174804688,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 1.2561728395061729,
      "grad_norm": 1.5318159052688987,
      "kl": 0.319091796875,
      "learning_rate": 4.3895227407697135e-07,
      "loss": -0.022,
      "num_tokens": 11471782.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20109625160694122,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 788.1875,
      "completions/mean_terminated_length": 695.9130859375,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 1.2592592592592593,
      "grad_norm": 1.8787070751135608,
      "kl": 0.2724609375,
      "learning_rate": 4.3862454291633523e-07,
      "loss": 0.1961,
      "num_tokens": 11503556.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.18140104413032532,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 763.09375,
      "completions/mean_terminated_length": 690.0399780273438,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 1.2623456790123457,
      "grad_norm": 2.0333008654259195,
      "kl": 0.324951171875,
      "learning_rate": 4.382960574534009e-07,
      "loss": -0.2294,
      "num_tokens": 11534543.0,
      "reward": -3.259629011154175e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 734.0,
      "completions/mean_terminated_length": 680.2963256835938,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 1.2654320987654322,
      "grad_norm": 1.2146753972907172,
      "kl": 0.32568359375,
      "learning_rate": 4.3796681900176903e-07,
      "loss": 0.0007,
      "num_tokens": 11564487.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.1158682331442833,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 727.875,
      "completions/mean_terminated_length": 708.1333618164062,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 1.2685185185185186,
      "grad_norm": 1.2463339846434414,
      "kl": 0.32421875,
      "learning_rate": 4.3763682887805153e-07,
      "loss": -0.0302,
      "num_tokens": 11594387.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11586824059486389,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 733.40625,
      "completions/mean_terminated_length": 679.5925903320312,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 1.2716049382716048,
      "grad_norm": 1.1210913026388611,
      "kl": 0.33154296875,
      "learning_rate": 4.3730608840186625e-07,
      "loss": 0.0047,
      "num_tokens": 11624296.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.1024516224861145,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 732.09375,
      "completions/mean_terminated_length": 678.0370483398438,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 1.2746913580246915,
      "grad_norm": 0.9685326971132457,
      "kl": 0.3294677734375,
      "learning_rate": 4.3697459889583166e-07,
      "loss": 0.0148,
      "num_tokens": 11654427.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 715.90625,
      "completions/mean_terminated_length": 644.8077392578125,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 1.2777777777777777,
      "grad_norm": 1.048193580651815,
      "kl": 0.3486328125,
      "learning_rate": 4.366423616855615e-07,
      "loss": -0.068,
      "num_tokens": 11683920.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 733.15625,
      "completions/mean_terminated_length": 691.607177734375,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 1.2808641975308643,
      "grad_norm": 1.2887510459006943,
      "kl": 0.307373046875,
      "learning_rate": 4.363093780996596e-07,
      "loss": -0.0282,
      "num_tokens": 11714213.0,
      "reward": 0.0,
      "reward_std": 0.14231424033641815,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 755.875,
      "completions/mean_terminated_length": 694.0,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 1.2839506172839505,
      "grad_norm": 0.008248914693801979,
      "kl": 0.29150390625,
      "learning_rate": 4.359756494697146e-07,
      "loss": 0.0003,
      "num_tokens": 11745765.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 802.6875,
      "completions/mean_terminated_length": 716.0869750976562,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 1.287037037037037,
      "grad_norm": 1.1157228360960278,
      "kl": 0.314453125,
      "learning_rate": 4.356411771302944e-07,
      "loss": -0.0366,
      "num_tokens": 11778179.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 764.21875,
      "completions/mean_terminated_length": 677.625,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 1.2901234567901234,
      "grad_norm": 0.7525352377728132,
      "kl": 0.339111328125,
      "learning_rate": 4.353059624189411e-07,
      "loss": 0.0033,
      "num_tokens": 11808910.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.10225499421358109,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 754.3125,
      "completions/mean_terminated_length": 704.370361328125,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 1.2932098765432098,
      "grad_norm": 1.2550176381062912,
      "kl": 0.3251953125,
      "learning_rate": 4.3497000667616534e-07,
      "loss": 0.0052,
      "num_tokens": 11839792.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.1158682331442833,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 763.53125,
      "completions/mean_terminated_length": 715.2963256835938,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 1.2962962962962963,
      "grad_norm": 0.5930065562985071,
      "kl": 0.3135986328125,
      "learning_rate": 4.346333112454413e-07,
      "loss": 0.0011,
      "num_tokens": 11870997.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 737.9375,
      "completions/mean_terminated_length": 697.0714721679688,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 1.2993827160493827,
      "grad_norm": 0.5148470707330806,
      "kl": 0.3297119140625,
      "learning_rate": 4.342958774732011e-07,
      "loss": -0.0102,
      "num_tokens": 11901319.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 762.3125,
      "completions/mean_terminated_length": 724.9285888671875,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 1.3024691358024691,
      "grad_norm": 0.8095249053527938,
      "kl": 0.318359375,
      "learning_rate": 4.3395770670882935e-07,
      "loss": -0.0128,
      "num_tokens": 11932529.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 719.21875,
      "completions/mean_terminated_length": 675.6785888671875,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 1.3055555555555556,
      "grad_norm": 1.0661536532752252,
      "kl": 0.328857421875,
      "learning_rate": 4.3361880030465803e-07,
      "loss": 0.0062,
      "num_tokens": 11962220.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 704.3125,
      "completions/mean_terminated_length": 671.2413940429688,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 1.308641975308642,
      "grad_norm": 1.2453204382631962,
      "kl": 0.35400390625,
      "learning_rate": 4.3327915961596066e-07,
      "loss": -0.0105,
      "num_tokens": 11991466.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.07151594012975693,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 729.6875,
      "completions/mean_terminated_length": 675.1851806640625,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 1.3117283950617284,
      "grad_norm": 0.5767029027540111,
      "kl": 0.3193359375,
      "learning_rate": 4.3293878600094746e-07,
      "loss": 0.0027,
      "num_tokens": 12021000.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 755.90625,
      "completions/mean_terminated_length": 694.0385131835938,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 1.3148148148148149,
      "grad_norm": 0.8638429458633945,
      "kl": 0.3359375,
      "learning_rate": 4.325976808207594e-07,
      "loss": -0.0058,
      "num_tokens": 12051457.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.08606424182653427,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 737.53125,
      "completions/mean_terminated_length": 671.423095703125,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 1.3179012345679013,
      "grad_norm": 1.0837469297609128,
      "kl": 0.32470703125,
      "learning_rate": 4.3225584543946303e-07,
      "loss": 0.0135,
      "num_tokens": 12081534.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 703.375,
      "completions/mean_terminated_length": 629.3846435546875,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 1.3209876543209877,
      "grad_norm": 1.4149183005045936,
      "kl": 0.35205078125,
      "learning_rate": 4.319132812240448e-07,
      "loss": -0.0514,
      "num_tokens": 12110210.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.15223580598831177,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 690.125,
      "completions/mean_terminated_length": 642.4285888671875,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 1.324074074074074,
      "grad_norm": 1.3785281938220508,
      "kl": 0.320556640625,
      "learning_rate": 4.3156998954440587e-07,
      "loss": -0.0047,
      "num_tokens": 12138394.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 692.21875,
      "completions/mean_terminated_length": 644.8214721679688,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 1.3271604938271606,
      "grad_norm": 0.5041500125980993,
      "kl": 0.3072509765625,
      "learning_rate": 4.312259717733565e-07,
      "loss": 0.0066,
      "num_tokens": 12166569.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 698.71875,
      "completions/mean_terminated_length": 665.0689697265625,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 1.3302469135802468,
      "grad_norm": 1.3994839331379305,
      "kl": 0.358154296875,
      "learning_rate": 4.308812292866105e-07,
      "loss": -0.0216,
      "num_tokens": 12195120.0,
      "reward": 0.028124993667006493,
      "reward_std": 0.12057674676179886,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 759.6875,
      "completions/mean_terminated_length": 698.6923217773438,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 1.3333333333333333,
      "grad_norm": 1.3113319724659547,
      "kl": 0.33251953125,
      "learning_rate": 4.3053576346277997e-07,
      "loss": -0.0501,
      "num_tokens": 12226218.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 723.3125,
      "completions/mean_terminated_length": 623.0833740234375,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 1.3364197530864197,
      "grad_norm": 1.6940617261107462,
      "kl": 0.386474609375,
      "learning_rate": 4.301895756833692e-07,
      "loss": -0.0873,
      "num_tokens": 12255320.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.16567710041999817,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 787.46875,
      "completions/mean_terminated_length": 679.95458984375,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 1.3395061728395061,
      "grad_norm": 0.9854554475323402,
      "kl": 0.3048095703125,
      "learning_rate": 4.298426673327701e-07,
      "loss": 0.0116,
      "num_tokens": 12287151.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09598580002784729,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1014.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 692.5,
      "completions/mean_terminated_length": 692.5,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 1.3425925925925926,
      "grad_norm": 0.7925101044513421,
      "kl": 0.361328125,
      "learning_rate": 4.2949503979825563e-07,
      "loss": 0.0061,
      "num_tokens": 12315707.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 868.0,
      "completions/mean_length": 704.90625,
      "completions/mean_terminated_length": 631.2692260742188,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 1.345679012345679,
      "grad_norm": 1.1121111546655797,
      "kl": 0.32177734375,
      "learning_rate": 4.2914669446997504e-07,
      "loss": -0.013,
      "num_tokens": 12344912.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15908847749233246,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 717.65625,
      "completions/mean_terminated_length": 685.9655151367188,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 1.3487654320987654,
      "grad_norm": 0.8626115907205927,
      "kl": 0.3309326171875,
      "learning_rate": 4.287976327409478e-07,
      "loss": -0.0177,
      "num_tokens": 12374525.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13241708278656006,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 730.8125,
      "completions/mean_terminated_length": 676.5184936523438,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 1.3518518518518519,
      "grad_norm": 0.8214353649895705,
      "kl": 0.34130859375,
      "learning_rate": 4.284478560070585e-07,
      "loss": 0.0237,
      "num_tokens": 12404283.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 753.78125,
      "completions/mean_terminated_length": 715.1785888671875,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 1.3549382716049383,
      "grad_norm": 1.416194781903389,
      "kl": 0.33544921875,
      "learning_rate": 4.280973656670508e-07,
      "loss": -0.066,
      "num_tokens": 12435016.0,
      "reward": 0.028124993667006493,
      "reward_std": 0.12057674676179886,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 755.71875,
      "completions/mean_terminated_length": 717.3928833007812,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 1.3580246913580247,
      "grad_norm": 1.1659610856616398,
      "kl": 0.3134765625,
      "learning_rate": 4.277461631225221e-07,
      "loss": -0.0346,
      "num_tokens": 12466079.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 644.5,
      "completions/mean_terminated_length": 632.258056640625,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 1.3611111111111112,
      "grad_norm": 0.9054769529161981,
      "kl": 0.34130859375,
      "learning_rate": 4.2739424977791784e-07,
      "loss": -0.0177,
      "num_tokens": 12493131.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 717.3125,
      "completions/mean_terminated_length": 673.5,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 1.3641975308641976,
      "grad_norm": 0.589301737696435,
      "kl": 0.3555908203125,
      "learning_rate": 4.2704162704052594e-07,
      "loss": 0.0094,
      "num_tokens": 12522029.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 799.78125,
      "completions/mean_terminated_length": 725.0416870117188,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 1.367283950617284,
      "grad_norm": 0.009300813154565053,
      "kl": 0.32470703125,
      "learning_rate": 4.2668829632047124e-07,
      "loss": 0.0003,
      "num_tokens": 12554062.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 718.09375,
      "completions/mean_terminated_length": 686.4483032226562,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 1.3703703703703702,
      "grad_norm": 0.7173552794948027,
      "kl": 0.302978515625,
      "learning_rate": 4.2633425903070973e-07,
      "loss": -0.0124,
      "num_tokens": 12583541.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 772.78125,
      "completions/mean_terminated_length": 714.8077392578125,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 1.373456790123457,
      "grad_norm": 0.49122238250860734,
      "kl": 0.288818359375,
      "learning_rate": 4.259795165870229e-07,
      "loss": -0.0027,
      "num_tokens": 12614670.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 753.78125,
      "completions/mean_terminated_length": 691.423095703125,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 1.376543209876543,
      "grad_norm": 2.062589270834479,
      "kl": 0.353515625,
      "learning_rate": 4.256240704080121e-07,
      "loss": -0.1871,
      "num_tokens": 12645067.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 730.03125,
      "completions/mean_terminated_length": 675.5925903320312,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 1.3796296296296298,
      "grad_norm": 1.2127359756000133,
      "kl": 0.3388671875,
      "learning_rate": 4.2526792191509297e-07,
      "loss": 0.0012,
      "num_tokens": 12674968.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 687.15625,
      "completions/mean_terminated_length": 652.3103637695312,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 1.382716049382716,
      "grad_norm": 0.7313600857010765,
      "kl": 0.297607421875,
      "learning_rate": 4.249110725324897e-07,
      "loss": 0.0003,
      "num_tokens": 12703301.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 745.78125,
      "completions/mean_terminated_length": 694.25927734375,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 1.3858024691358024,
      "grad_norm": 1.332798569391717,
      "kl": 0.3165283203125,
      "learning_rate": 4.2455352368722916e-07,
      "loss": -0.0528,
      "num_tokens": 12734442.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 816.125,
      "completions/mean_terminated_length": 721.6364135742188,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 1.3888888888888888,
      "grad_norm": 0.4313526530697991,
      "kl": 0.335693359375,
      "learning_rate": 4.2419527680913554e-07,
      "loss": 0.0069,
      "num_tokens": 12767014.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 735.375,
      "completions/mean_terminated_length": 681.9259033203125,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 1.3919753086419753,
      "grad_norm": 1.4063448246433303,
      "kl": 0.348388671875,
      "learning_rate": 4.2383633333082423e-07,
      "loss": -0.0812,
      "num_tokens": 12797074.0,
      "reward": -4.6566128730773926e-09,
      "reward_std": 0.15007510781288147,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.026798367500305e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 742.15625,
      "completions/mean_terminated_length": 689.9629516601562,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 1.3950617283950617,
      "grad_norm": 0.8428837510838175,
      "kl": 0.3349609375,
      "learning_rate": 4.234766946876965e-07,
      "loss": 0.0141,
      "num_tokens": 12827079.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1254904866218567,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 717.125,
      "completions/mean_terminated_length": 673.2857666015625,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 1.3981481481481481,
      "grad_norm": 0.009826459446768576,
      "kl": 0.371337890625,
      "learning_rate": 4.231163623179335e-07,
      "loss": 0.0004,
      "num_tokens": 12856795.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 794.125,
      "completions/mean_terminated_length": 689.6364135742188,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 1.4012345679012346,
      "grad_norm": 0.009539250603866232,
      "kl": 0.312744140625,
      "learning_rate": 4.227553376624904e-07,
      "loss": 0.0003,
      "num_tokens": 12889007.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 794.75,
      "completions/mean_terminated_length": 690.5454711914062,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 1.404320987654321,
      "grad_norm": 0.966587225456212,
      "kl": 0.328369140625,
      "learning_rate": 4.22393622165091e-07,
      "loss": 0.0279,
      "num_tokens": 12921271.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.16195404529571533,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 782.09375,
      "completions/mean_terminated_length": 726.269287109375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 1.4074074074074074,
      "grad_norm": 0.6819926506904097,
      "kl": 0.335205078125,
      "learning_rate": 4.220312172722216e-07,
      "loss": 0.0361,
      "num_tokens": 12952590.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 818.90625,
      "completions/mean_terminated_length": 750.5416870117188,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 1.4104938271604939,
      "grad_norm": 0.008401392670728085,
      "kl": 0.34423828125,
      "learning_rate": 4.216681244331256e-07,
      "loss": 0.0003,
      "num_tokens": 12985303.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 784.34375,
      "completions/mean_terminated_length": 704.4583740234375,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.4135802469135803,
      "grad_norm": 0.9587602391256548,
      "kl": 0.353515625,
      "learning_rate": 4.2130434509979714e-07,
      "loss": -0.0278,
      "num_tokens": 13017686.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 734.71875,
      "completions/mean_terminated_length": 704.7930908203125,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 1.4166666666666667,
      "grad_norm": 1.5716828641475282,
      "kl": 0.34423828125,
      "learning_rate": 4.209398807269758e-07,
      "loss": -0.154,
      "num_tokens": 13047677.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15870162844657898,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 778.8125,
      "completions/mean_terminated_length": 710.1599731445312,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 1.4197530864197532,
      "grad_norm": 0.7154739665915979,
      "kl": 0.3125,
      "learning_rate": 4.205747327721407e-07,
      "loss": -0.0229,
      "num_tokens": 13078863.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 781.4375,
      "completions/mean_terminated_length": 700.5833740234375,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 1.4228395061728394,
      "grad_norm": 0.9896365183054785,
      "kl": 0.31884765625,
      "learning_rate": 4.2020890269550454e-07,
      "loss": 0.0131,
      "num_tokens": 13110765.0,
      "reward": 0.0,
      "reward_std": 0.16249045729637146,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 776.0,
      "completions/mean_terminated_length": 678.95654296875,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 1.425925925925926,
      "grad_norm": 0.49623455304372194,
      "kl": 0.3243408203125,
      "learning_rate": 4.198423919600076e-07,
      "loss": 0.0053,
      "num_tokens": 13142181.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 942.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 724.625,
      "completions/mean_terminated_length": 724.625,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 1.4290123456790123,
      "grad_norm": 1.229600254826551,
      "kl": 0.3414306640625,
      "learning_rate": 4.1947520203131217e-07,
      "loss": -0.0044,
      "num_tokens": 13171625.0,
      "reward": 2.3283064365386963e-09,
      "reward_std": 0.20035111904144287,
      "rewards/format_reward_func/mean": 2.2351741790771484e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 778.21875,
      "completions/mean_terminated_length": 721.5,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 1.4320987654320987,
      "grad_norm": 0.8382003160649317,
      "kl": 0.3203125,
      "learning_rate": 4.191073343777968e-07,
      "loss": 0.0069,
      "num_tokens": 13203664.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 464
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 744.3125,
      "completions/mean_terminated_length": 679.7692260742188,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 1.4351851851851851,
      "grad_norm": 0.7937742594266002,
      "kl": NaN,
      "learning_rate": 4.1873879047055005e-07,
      "loss": 0.0015,
      "num_tokens": 13233842.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 780.96875,
      "completions/mean_terminated_length": 685.8695678710938,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 1.4382716049382716,
      "grad_norm": 1.2777775438614578,
      "kl": 0.322021484375,
      "learning_rate": 4.183695717833649e-07,
      "loss": -0.0758,
      "num_tokens": 13265621.0,
      "reward": 0.0,
      "reward_std": 0.14349564909934998,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 805.375,
      "completions/mean_terminated_length": 744.1599731445312,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 1.441358024691358,
      "grad_norm": 0.7591786636402239,
      "kl": 0.341064453125,
      "learning_rate": 4.179996797927326e-07,
      "loss": 0.0326,
      "num_tokens": 13297765.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 777.4375,
      "completions/mean_terminated_length": 680.95654296875,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 1.4444444444444444,
      "grad_norm": 0.5209501100963339,
      "kl": 0.3345947265625,
      "learning_rate": 4.17629115977837e-07,
      "loss": 0.0056,
      "num_tokens": 13329271.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 862.5,
      "completions/mean_terminated_length": 765.6000366210938,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 1.4475308641975309,
      "grad_norm": 0.5268008371662545,
      "kl": 0.33154296875,
      "learning_rate": 4.1725788182054867e-07,
      "loss": 0.0309,
      "num_tokens": 13363535.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 809.0,
      "completions/mean_length": 772.40625,
      "completions/mean_terminated_length": 640.6190795898438,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 1.4506172839506173,
      "grad_norm": 0.010361077728183815,
      "kl": 0.324462890625,
      "learning_rate": 4.1688597880541863e-07,
      "loss": 0.0003,
      "num_tokens": 13394608.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 737.59375,
      "completions/mean_terminated_length": 671.5,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 1.4537037037037037,
      "grad_norm": 1.4216610925062358,
      "kl": 0.31201171875,
      "learning_rate": 4.1651340841967284e-07,
      "loss": -0.0691,
      "num_tokens": 13424467.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 684.28125,
      "completions/mean_terminated_length": 635.75,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 1.4567901234567902,
      "grad_norm": 0.8916789301480571,
      "kl": 0.38720703125,
      "learning_rate": 4.161401721532059e-07,
      "loss": -0.0286,
      "num_tokens": 13452452.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 763.46875,
      "completions/mean_terminated_length": 726.2500610351562,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 1.4598765432098766,
      "grad_norm": 0.009345990794967311,
      "kl": 0.31298828125,
      "learning_rate": 4.1576627149857513e-07,
      "loss": 0.0003,
      "num_tokens": 13483403.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 704.15625,
      "completions/mean_terminated_length": 671.0689697265625,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 1.462962962962963,
      "grad_norm": 1.335597575659992,
      "kl": 0.3358154296875,
      "learning_rate": 4.153917079509952e-07,
      "loss": -0.0718,
      "num_tokens": 13512116.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 786.5625,
      "completions/mean_terminated_length": 678.6364135742188,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 1.4660493827160495,
      "grad_norm": 1.1981375403128411,
      "kl": 0.34423828125,
      "learning_rate": 4.150164830083311e-07,
      "loss": -0.0288,
      "num_tokens": 13543370.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 835.0,
      "completions/mean_terminated_length": 736.0,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 1.4691358024691357,
      "grad_norm": 0.9639150272725308,
      "kl": 0.357666015625,
      "learning_rate": 4.146405981710931e-07,
      "loss": 0.0124,
      "num_tokens": 13576846.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 773.25,
      "completions/mean_terminated_length": 703.0399780273438,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 1.4722222222222223,
      "grad_norm": 0.47758534044903295,
      "kl": 0.3179931640625,
      "learning_rate": 4.142640549424302e-07,
      "loss": 0.0216,
      "num_tokens": 13607994.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 767.03125,
      "completions/mean_terminated_length": 681.375,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 1.4753086419753085,
      "grad_norm": 0.5999650568642271,
      "kl": 0.331787109375,
      "learning_rate": 4.1388685482812413e-07,
      "loss": 0.0102,
      "num_tokens": 13639259.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.055743563920259476,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 763.5625,
      "completions/mean_terminated_length": 726.357177734375,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 1.4783950617283952,
      "grad_norm": 0.8444555416217089,
      "kl": 0.3466796875,
      "learning_rate": 4.135089993365839e-07,
      "loss": 0.0116,
      "num_tokens": 13669713.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 770.125,
      "completions/mean_terminated_length": 699.0399780273438,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 1.4814814814814814,
      "grad_norm": 0.7185053938384726,
      "kl": 0.312255859375,
      "learning_rate": 4.131304899788389e-07,
      "loss": 0.0193,
      "num_tokens": 13700593.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 729.5625,
      "completions/mean_terminated_length": 675.0370483398438,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 1.4845679012345678,
      "grad_norm": 0.014659263925658098,
      "kl": 0.32861328125,
      "learning_rate": 4.127513282685336e-07,
      "loss": 0.0003,
      "num_tokens": 13730239.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 758.6875,
      "completions/mean_terminated_length": 684.3999633789062,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 1.4876543209876543,
      "grad_norm": 1.2735473500356849,
      "kl": 0.326416015625,
      "learning_rate": 4.123715157219211e-07,
      "loss": -0.0272,
      "num_tokens": 13760825.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 776.125,
      "completions/mean_terminated_length": 718.923095703125,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 1.4907407407407407,
      "grad_norm": 1.0599098808411953,
      "kl": 0.336669921875,
      "learning_rate": 4.1199105385785727e-07,
      "loss": -0.038,
      "num_tokens": 13791961.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 767.34375,
      "completions/mean_terminated_length": 681.7916870117188,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 1.4938271604938271,
      "grad_norm": 1.0576775184964902,
      "kl": 0.342529296875,
      "learning_rate": 4.116099441977943e-07,
      "loss": -0.0091,
      "num_tokens": 13822512.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 795.9375,
      "completions/mean_terminated_length": 706.6956787109375,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 1.4969135802469136,
      "grad_norm": 0.9045893641800498,
      "kl": 0.333251953125,
      "learning_rate": 4.112281882657751e-07,
      "loss": 0.0041,
      "num_tokens": 13854758.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 763.9375,
      "completions/mean_terminated_length": 691.1199951171875,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 1.5,
      "grad_norm": 0.6837431452553143,
      "kl": 0.3494873046875,
      "learning_rate": 4.1084578758842714e-07,
      "loss": -0.0309,
      "num_tokens": 13885544.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 841.75,
      "completions/mean_terminated_length": 758.9091186523438,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 1.5030864197530864,
      "grad_norm": 0.5426373015151418,
      "kl": 0.3328857421875,
      "learning_rate": 4.104627436949559e-07,
      "loss": 0.0041,
      "num_tokens": 13919500.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 718.625,
      "completions/mean_terminated_length": 675.0,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 1.5061728395061729,
      "grad_norm": 0.03333683949432546,
      "kl": 0.3377685546875,
      "learning_rate": 4.1007905811713915e-07,
      "loss": 0.0003,
      "num_tokens": 13949272.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 731.40625,
      "completions/mean_terminated_length": 677.2222290039062,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 1.5092592592592593,
      "grad_norm": 0.8560193854362351,
      "kl": 0.319091796875,
      "learning_rate": 4.096947323893209e-07,
      "loss": 0.0182,
      "num_tokens": 13979333.0,
      "reward": 0.0,
      "reward_std": 0.15134452283382416,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 814.375,
      "completions/mean_terminated_length": 732.3478393554688,
      "completions/min_length": 513.0,
      "completions/min_terminated_length": 513.0,
      "epoch": 1.5123456790123457,
      "grad_norm": 0.7320888639123382,
      "kl": 0.338134765625,
      "learning_rate": 4.0930976804840487e-07,
      "loss": 0.013,
      "num_tokens": 14011745.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 751.90625,
      "completions/mean_terminated_length": 675.719970703125,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 1.515432098765432,
      "grad_norm": 1.1634269362480991,
      "kl": 0.3232421875,
      "learning_rate": 4.0892416663384874e-07,
      "loss": -0.0086,
      "num_tokens": 14042430.0,
      "reward": 0.0,
      "reward_std": 0.18555021286010742,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 770.28125,
      "completions/mean_terminated_length": 671.0,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 1.5185185185185186,
      "grad_norm": 0.9283628572890374,
      "kl": 0.342041015625,
      "learning_rate": 4.0853792968765765e-07,
      "loss": -0.0245,
      "num_tokens": 14073871.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.04620163142681122,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 742.65625,
      "completions/mean_terminated_length": 677.7307739257812,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 1.5216049382716048,
      "grad_norm": 0.007932142220837816,
      "kl": 0.312255859375,
      "learning_rate": 4.081510587543784e-07,
      "loss": 0.0003,
      "num_tokens": 14104008.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 711.0,
      "completions/mean_terminated_length": 690.1333618164062,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 1.5246913580246915,
      "grad_norm": 0.48836777635870254,
      "kl": 0.34423828125,
      "learning_rate": 4.0776355538109285e-07,
      "loss": 0.0342,
      "num_tokens": 14132764.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 746.09375,
      "completions/mean_terminated_length": 681.9615478515625,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 1.5277777777777777,
      "grad_norm": 0.7484600599280975,
      "kl": 0.343017578125,
      "learning_rate": 4.073754211174123e-07,
      "loss": -0.02,
      "num_tokens": 14162819.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 804.84375,
      "completions/mean_terminated_length": 705.227294921875,
      "completions/min_length": 290.0,
      "completions/min_terminated_length": 290.0,
      "epoch": 1.5308641975308643,
      "grad_norm": 0.011511269890448983,
      "kl": 0.3104248046875,
      "learning_rate": 4.069866575154706e-07,
      "loss": 0.0003,
      "num_tokens": 14195318.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 801.3125,
      "completions/mean_terminated_length": 714.1739501953125,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 1.5339506172839505,
      "grad_norm": 0.41349112183922404,
      "kl": 0.3045654296875,
      "learning_rate": 4.0659726612991853e-07,
      "loss": 0.03,
      "num_tokens": 14227284.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 746.71875,
      "completions/mean_terminated_length": 682.7307739257812,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 1.5370370370370372,
      "grad_norm": 1.0345874828431705,
      "kl": 0.3349609375,
      "learning_rate": 4.062072485179172e-07,
      "loss": -0.0447,
      "num_tokens": 14257171.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 724.40625,
      "completions/mean_terminated_length": 681.607177734375,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 1.5401234567901234,
      "grad_norm": 0.9216301631012608,
      "kl": 0.306640625,
      "learning_rate": 4.0581660623913216e-07,
      "loss": -0.0125,
      "num_tokens": 14287008.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 718.78125,
      "completions/mean_terminated_length": 675.1785888671875,
      "completions/min_length": 340.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 1.5432098765432098,
      "grad_norm": 0.008509882765931261,
      "kl": 0.301513671875,
      "learning_rate": 4.0542534085572677e-07,
      "loss": 0.0003,
      "num_tokens": 14316409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 816.28125,
      "completions/mean_terminated_length": 758.1199951171875,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 1.5462962962962963,
      "grad_norm": 0.8310320267661888,
      "kl": 0.3087158203125,
      "learning_rate": 4.050334539323563e-07,
      "loss": -0.0397,
      "num_tokens": 14348938.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 758.40625,
      "completions/mean_terminated_length": 709.2222290039062,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 1.5493827160493827,
      "grad_norm": 0.5701134440013239,
      "kl": 0.3228759765625,
      "learning_rate": 4.046409470361615e-07,
      "loss": 0.0109,
      "num_tokens": 14379459.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 829.0625,
      "completions/mean_terminated_length": 752.7825927734375,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 1.5524691358024691,
      "grad_norm": 0.9379343393898524,
      "kl": 0.357177734375,
      "learning_rate": 4.0424782173676235e-07,
      "loss": 0.0006,
      "num_tokens": 14413025.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.13805748522281647,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 776.5625,
      "completions/mean_terminated_length": 719.4615478515625,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 1.5555555555555556,
      "grad_norm": 0.008022671684356462,
      "kl": 0.30859375,
      "learning_rate": 4.0385407960625185e-07,
      "loss": 0.0003,
      "num_tokens": 14444223.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 813.5,
      "completions/mean_terminated_length": 731.1304321289062,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.558641975308642,
      "grad_norm": 0.6535608066536647,
      "kl": 0.2850341796875,
      "learning_rate": 4.034597222191896e-07,
      "loss": 0.0175,
      "num_tokens": 14477047.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 732.5625,
      "completions/mean_terminated_length": 635.4166870117188,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 1.5617283950617284,
      "grad_norm": 1.3287593993080316,
      "kl": 0.30810546875,
      "learning_rate": 4.030647511525956e-07,
      "loss": -0.0387,
      "num_tokens": 14506753.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18145698308944702,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 751.8125,
      "completions/mean_terminated_length": 689.0,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 1.5648148148148149,
      "grad_norm": 1.3456393758009917,
      "kl": 0.359619140625,
      "learning_rate": 4.0266916798594417e-07,
      "loss": -0.0362,
      "num_tokens": 14537319.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.06432675570249557,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 764.4375,
      "completions/mean_terminated_length": 691.760009765625,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 1.567901234567901,
      "grad_norm": 1.1420903046945365,
      "kl": 0.344970703125,
      "learning_rate": 4.02272974301157e-07,
      "loss": -0.041,
      "num_tokens": 14568101.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.06432675570249557,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 684.59375,
      "completions/mean_terminated_length": 649.4827270507812,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 1.5709876543209877,
      "grad_norm": 1.424125050178182,
      "kl": 0.3287353515625,
      "learning_rate": 4.018761716825974e-07,
      "loss": -0.0406,
      "num_tokens": 14596052.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 739.375,
      "completions/mean_terminated_length": 659.6799926757812,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 1.574074074074074,
      "grad_norm": 0.5340550821616038,
      "kl": 0.3369140625,
      "learning_rate": 4.014787617170639e-07,
      "loss": -0.0007,
      "num_tokens": 14625908.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 789.0,
      "completions/mean_terminated_length": 665.90478515625,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 1.5771604938271606,
      "grad_norm": 1.642133768672505,
      "kl": 0.3233642578125,
      "learning_rate": 4.010807459937836e-07,
      "loss": -0.0763,
      "num_tokens": 14658096.0,
      "reward": -3.259629011154175e-09,
      "reward_std": 0.1841180920600891,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 737.8125,
      "completions/mean_terminated_length": 696.9285888671875,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 1.5802469135802468,
      "grad_norm": 0.991364149264096,
      "kl": 0.3172607421875,
      "learning_rate": 4.006821261044061e-07,
      "loss": -0.0026,
      "num_tokens": 14687938.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 879.0,
      "completions/mean_length": 664.59375,
      "completions/mean_terminated_length": 640.6333618164062,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 1.5833333333333335,
      "grad_norm": 1.0790011938421236,
      "kl": 0.28271484375,
      "learning_rate": 4.002829036429971e-07,
      "loss": -0.0612,
      "num_tokens": 14715365.0,
      "reward": 0.0,
      "reward_std": 0.15626253187656403,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 807.03125,
      "completions/mean_terminated_length": 722.1304321289062,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 1.5864197530864197,
      "grad_norm": 0.5004503459877179,
      "kl": 0.3203125,
      "learning_rate": 3.998830802060317e-07,
      "loss": 0.011,
      "num_tokens": 14748206.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 675.125,
      "completions/mean_terminated_length": 663.8709716796875,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 1.5895061728395061,
      "grad_norm": 0.8693506099410626,
      "kl": 0.315673828125,
      "learning_rate": 3.994826573923886e-07,
      "loss": 0.0045,
      "num_tokens": 14776182.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 778.0,
      "completions/mean_terminated_length": 696.0,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 1.5925925925925926,
      "grad_norm": 1.365680137710999,
      "kl": 0.3133544921875,
      "learning_rate": 3.9908163680334326e-07,
      "loss": 0.0466,
      "num_tokens": 14807814.0,
      "reward": 0.0,
      "reward_std": 0.15241660177707672,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 791.0,
      "completions/mean_terminated_length": 685.0909423828125,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 1.595679012345679,
      "grad_norm": 0.8125239703636703,
      "kl": 0.3349609375,
      "learning_rate": 3.9868002004256165e-07,
      "loss": -0.0124,
      "num_tokens": 14839902.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 678.15625,
      "completions/mean_terminated_length": 614.1111450195312,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 1.5987654320987654,
      "grad_norm": 1.1162676468006334,
      "kl": 0.34130859375,
      "learning_rate": 3.982778087160935e-07,
      "loss": -0.002,
      "num_tokens": 14867803.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.17414312064647675,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 706.9375,
      "completions/mean_terminated_length": 661.6428833007812,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 1.6018518518518519,
      "grad_norm": 0.9426113162928746,
      "kl": 0.337890625,
      "learning_rate": 3.9787500443236664e-07,
      "loss": 0.0316,
      "num_tokens": 14897001.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 730.125,
      "completions/mean_terminated_length": 632.1666870117188,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 1.6049382716049383,
      "grad_norm": 1.2663004119632248,
      "kl": 0.3106689453125,
      "learning_rate": 3.9747160880217994e-07,
      "loss": 0.0168,
      "num_tokens": 14927069.0,
      "reward": 0.0,
      "reward_std": 0.1774374544620514,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 716.90625,
      "completions/mean_terminated_length": 660.0370483398438,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 1.6080246913580247,
      "grad_norm": 0.7914569497747325,
      "kl": 0.3240966796875,
      "learning_rate": 3.9706762343869705e-07,
      "loss": -0.0565,
      "num_tokens": 14956450.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 713.71875,
      "completions/mean_terminated_length": 693.0333862304688,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 1.6111111111111112,
      "grad_norm": 0.7631646064641017,
      "kl": 0.316162109375,
      "learning_rate": 3.966630499574397e-07,
      "loss": -0.0269,
      "num_tokens": 14985545.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.02981424145400524,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 734.0,
      "completions/mean_terminated_length": 637.3333740234375,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 1.6141975308641974,
      "grad_norm": 0.8161152833021108,
      "kl": 0.349365234375,
      "learning_rate": 3.9625788997628196e-07,
      "loss": -0.0286,
      "num_tokens": 15015301.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 711.125,
      "completions/mean_terminated_length": 638.923095703125,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 1.617283950617284,
      "grad_norm": 0.028883129384840816,
      "kl": 0.370361328125,
      "learning_rate": 3.958521451154428e-07,
      "loss": 0.0004,
      "num_tokens": 15044109.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 700.5625,
      "completions/mean_terminated_length": 654.357177734375,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 1.6203703703703702,
      "grad_norm": 0.5909773514766553,
      "kl": 0.3387451171875,
      "learning_rate": 3.954458169974805e-07,
      "loss": 0.0013,
      "num_tokens": 15072655.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 769.40625,
      "completions/mean_terminated_length": 684.5416870117188,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 1.623456790123457,
      "grad_norm": 0.016783433466779622,
      "kl": 0.3369140625,
      "learning_rate": 3.950389072472855e-07,
      "loss": 0.0003,
      "num_tokens": 15104216.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 687.125,
      "completions/mean_terminated_length": 664.6666870117188,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 1.626543209876543,
      "grad_norm": 0.8239224947604685,
      "kl": 0.2880859375,
      "learning_rate": 3.9463141749207425e-07,
      "loss": -0.0072,
      "num_tokens": 15132948.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 774.96875,
      "completions/mean_terminated_length": 705.239990234375,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 1.6296296296296298,
      "grad_norm": 1.238861288818675,
      "kl": 0.346923828125,
      "learning_rate": 3.9422334936138255e-07,
      "loss": -0.0617,
      "num_tokens": 15163967.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 783.40625,
      "completions/mean_terminated_length": 674.0454711914062,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 1.632716049382716,
      "grad_norm": 0.9515951609164494,
      "kl": 0.2908935546875,
      "learning_rate": 3.938147044870594e-07,
      "loss": -0.0016,
      "num_tokens": 15196000.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 771.65625,
      "completions/mean_terminated_length": 672.9130859375,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 1.6358024691358026,
      "grad_norm": 0.7980404553042689,
      "kl": 0.323974609375,
      "learning_rate": 3.934054845032598e-07,
      "loss": -0.0048,
      "num_tokens": 15227053.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 530
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 756.71875,
      "completions/mean_terminated_length": 667.625,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 1.6388888888888888,
      "grad_norm": 0.007225599201778235,
      "kl": NaN,
      "learning_rate": 3.9299569104643876e-07,
      "loss": 0.0003,
      "num_tokens": 15258404.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 741.96875,
      "completions/mean_terminated_length": 701.6785888671875,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 1.6419753086419753,
      "grad_norm": 0.008925163796735084,
      "kl": 0.3076171875,
      "learning_rate": 3.925853257553445e-07,
      "loss": 0.0003,
      "num_tokens": 15288595.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 763.75,
      "completions/mean_terminated_length": 726.5714721679688,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 1.6450617283950617,
      "grad_norm": 1.3739270666570305,
      "kl": 0.3138427734375,
      "learning_rate": 3.921743902710122e-07,
      "loss": -0.0597,
      "num_tokens": 15319635.0,
      "reward": 0.0,
      "reward_std": 0.15000322461128235,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 729.25,
      "completions/mean_terminated_length": 646.719970703125,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 1.6481481481481481,
      "grad_norm": 0.9423901024229169,
      "kl": 0.34130859375,
      "learning_rate": 3.917628862367569e-07,
      "loss": 0.0212,
      "num_tokens": 15349663.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.11199356615543365,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 816.5,
      "completions/mean_terminated_length": 707.8095092773438,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 1.6512345679012346,
      "grad_norm": 1.0973894313986772,
      "kl": 0.2958984375,
      "learning_rate": 3.913508152981674e-07,
      "loss": -0.0598,
      "num_tokens": 15382031.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 740.53125,
      "completions/mean_terminated_length": 661.1599731445312,
      "completions/min_length": 340.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 1.654320987654321,
      "grad_norm": 0.017496266324640054,
      "kl": 0.330810546875,
      "learning_rate": 3.909381791030998e-07,
      "loss": 0.0003,
      "num_tokens": 15411776.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 767.96875,
      "completions/mean_terminated_length": 682.625,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 1.6574074074074074,
      "grad_norm": 1.0650048112777157,
      "kl": 0.3212890625,
      "learning_rate": 3.905249793016702e-07,
      "loss": -0.0223,
      "num_tokens": 15442927.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09598580002784729,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 788.25,
      "completions/mean_terminated_length": 722.239990234375,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 1.6604938271604939,
      "grad_norm": 0.7107384859454141,
      "kl": 0.2861328125,
      "learning_rate": 3.9011121754624865e-07,
      "loss": 0.014,
      "num_tokens": 15474435.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 745.125,
      "completions/mean_terminated_length": 693.4815063476562,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 1.6635802469135803,
      "grad_norm": 0.008815093813455377,
      "kl": 0.3265380859375,
      "learning_rate": 3.8969689549145266e-07,
      "loss": 0.0003,
      "num_tokens": 15505515.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 840.0,
      "completions/mean_length": 673.625,
      "completions/mean_terminated_length": 623.5714721679688,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 1.6666666666666665,
      "grad_norm": 0.021289576005990775,
      "kl": 0.366943359375,
      "learning_rate": 3.8928201479414024e-07,
      "loss": 0.0004,
      "num_tokens": 15533119.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 683.21875,
      "completions/mean_terminated_length": 634.5357666015625,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 1.6697530864197532,
      "grad_norm": 0.7322246144769231,
      "kl": 0.314697265625,
      "learning_rate": 3.888665771134032e-07,
      "loss": -0.0059,
      "num_tokens": 15561374.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 847.65625,
      "completions/mean_terminated_length": 710.5,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 1.6728395061728394,
      "grad_norm": 1.106478983479238,
      "kl": 0.27490234375,
      "learning_rate": 3.8845058411056095e-07,
      "loss": -0.0571,
      "num_tokens": 15595731.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.11199356615543365,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 774.1875,
      "completions/mean_terminated_length": 676.434814453125,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 1.675925925925926,
      "grad_norm": 1.1423664079438094,
      "kl": 0.320556640625,
      "learning_rate": 3.880340374491535e-07,
      "loss": -0.0267,
      "num_tokens": 15627549.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 742.875,
      "completions/mean_terminated_length": 664.1599731445312,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 1.6790123456790123,
      "grad_norm": 0.8397405428454164,
      "kl": 0.352783203125,
      "learning_rate": 3.8761693879493495e-07,
      "loss": -0.061,
      "num_tokens": 15657561.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 845.125,
      "completions/mean_terminated_length": 737.7999877929688,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 1.682098765432099,
      "grad_norm": 0.009647961507256286,
      "kl": 0.313720703125,
      "learning_rate": 3.871992898158667e-07,
      "loss": 0.0003,
      "num_tokens": 15691197.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 780.625,
      "completions/mean_terminated_length": 724.4615478515625,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 1.6851851851851851,
      "grad_norm": 0.9076764410303536,
      "kl": 0.2767333984375,
      "learning_rate": 3.867810921821112e-07,
      "loss": -0.0132,
      "num_tokens": 15722969.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 770.5625,
      "completions/mean_terminated_length": 712.0769653320312,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 1.6882716049382716,
      "grad_norm": 1.1350196498890335,
      "kl": 0.3204345703125,
      "learning_rate": 3.863623475660245e-07,
      "loss": 0.0176,
      "num_tokens": 15754063.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 793.4375,
      "completions/mean_terminated_length": 728.8800048828125,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 1.691358024691358,
      "grad_norm": 0.7098311299876164,
      "kl": 0.3018798828125,
      "learning_rate": 3.859430576421503e-07,
      "loss": -0.0029,
      "num_tokens": 15785921.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 744.84375,
      "completions/mean_terminated_length": 715.9655151367188,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 1.6944444444444444,
      "grad_norm": 0.8675803468787185,
      "kl": 0.29931640625,
      "learning_rate": 3.855232240872128e-07,
      "loss": 0.03,
      "num_tokens": 15816460.0,
      "reward": 0.0,
      "reward_std": 0.13003921508789062,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 884.0,
      "completions/mean_length": 693.9375,
      "completions/mean_terminated_length": 646.7857666015625,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 1.6975308641975309,
      "grad_norm": 1.3689599144314457,
      "kl": 0.35400390625,
      "learning_rate": 3.851028485801105e-07,
      "loss": -0.0666,
      "num_tokens": 15844722.0,
      "reward": 0.028124993667006493,
      "reward_std": 0.09598580002784729,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 801.03125,
      "completions/mean_terminated_length": 726.7083740234375,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 1.7006172839506173,
      "grad_norm": 0.8281656840707402,
      "kl": 0.28076171875,
      "learning_rate": 3.8468193280190864e-07,
      "loss": -0.0107,
      "num_tokens": 15876527.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 701.84375,
      "completions/mean_terminated_length": 655.8214721679688,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 1.7037037037037037,
      "grad_norm": 0.00877201557433776,
      "kl": 0.335205078125,
      "learning_rate": 3.842604784358333e-07,
      "loss": 0.0003,
      "num_tokens": 15905018.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 796.9375,
      "completions/mean_terminated_length": 721.25,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 1.7067901234567902,
      "grad_norm": 0.5638693262466036,
      "kl": 0.327880859375,
      "learning_rate": 3.8383848716726444e-07,
      "loss": 0.0316,
      "num_tokens": 15936972.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 766.90625,
      "completions/mean_terminated_length": 719.2963256835938,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 1.7098765432098766,
      "grad_norm": 1.2138127334164506,
      "kl": 0.320556640625,
      "learning_rate": 3.8341596068372874e-07,
      "loss": 0.0101,
      "num_tokens": 15967969.0,
      "reward": 0.0,
      "reward_std": 0.15677303075790405,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 739.75,
      "completions/mean_terminated_length": 699.1428833007812,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 1.7129629629629628,
      "grad_norm": 0.7032138830186002,
      "kl": 0.32177734375,
      "learning_rate": 3.829929006748934e-07,
      "loss": -0.0115,
      "num_tokens": 15998525.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 840.09375,
      "completions/mean_terminated_length": 768.1304321289062,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 1.7160493827160495,
      "grad_norm": 0.008068159736726943,
      "kl": 0.2872314453125,
      "learning_rate": 3.8256930883255927e-07,
      "loss": 0.0003,
      "num_tokens": 16032160.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 736.59375,
      "completions/mean_terminated_length": 656.1199951171875,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 1.7191358024691357,
      "grad_norm": 0.6726821694845868,
      "kl": 0.3104248046875,
      "learning_rate": 3.8214518685065377e-07,
      "loss": -0.0004,
      "num_tokens": 16062155.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 795.03125,
      "completions/mean_terminated_length": 742.1923217773438,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 1.7222222222222223,
      "grad_norm": 0.8514285385130014,
      "kl": 0.327392578125,
      "learning_rate": 3.817205364252244e-07,
      "loss": -0.0153,
      "num_tokens": 16093984.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 787.96875,
      "completions/mean_terminated_length": 721.8800048828125,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 1.7253086419753085,
      "grad_norm": 0.9523489909182068,
      "kl": 0.33056640625,
      "learning_rate": 3.8129535925443187e-07,
      "loss": -0.0247,
      "num_tokens": 16125891.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 771.625,
      "completions/mean_terminated_length": 700.9599609375,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 1.7283950617283952,
      "grad_norm": 1.0422373945268346,
      "kl": 0.34423828125,
      "learning_rate": 3.8086965703854336e-07,
      "loss": -0.0144,
      "num_tokens": 16157119.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18104533851146698,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 797.25,
      "completions/mean_terminated_length": 733.760009765625,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 1.7314814814814814,
      "grad_norm": 0.007672943135312286,
      "kl": 0.335693359375,
      "learning_rate": 3.8044343147992563e-07,
      "loss": 0.0003,
      "num_tokens": 16189839.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 796.34375,
      "completions/mean_terminated_length": 720.4583740234375,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 1.734567901234568,
      "grad_norm": 0.7619376986860051,
      "kl": 0.320556640625,
      "learning_rate": 3.8001668428303847e-07,
      "loss": 0.0218,
      "num_tokens": 16222026.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 769.59375,
      "completions/mean_terminated_length": 733.2500610351562,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 1.7376543209876543,
      "grad_norm": 0.014792220069454454,
      "kl": 0.342041015625,
      "learning_rate": 3.7958941715442726e-07,
      "loss": 0.0003,
      "num_tokens": 16253197.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 748.53125,
      "completions/mean_terminated_length": 671.3999633789062,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 1.7407407407407407,
      "grad_norm": 0.4933010943286016,
      "kl": 0.2974853515625,
      "learning_rate": 3.791616318027171e-07,
      "loss": 0.0099,
      "num_tokens": 16283482.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 842.96875,
      "completions/mean_terminated_length": 748.1428833007812,
      "completions/min_length": 536.0,
      "completions/min_terminated_length": 536.0,
      "epoch": 1.7438271604938271,
      "grad_norm": 0.00825346935935361,
      "kl": 0.2969970703125,
      "learning_rate": 3.78733329938605e-07,
      "loss": 0.0003,
      "num_tokens": 16317837.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 774.53125,
      "completions/mean_terminated_length": 676.9130859375,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 1.7469135802469136,
      "grad_norm": 0.7834725540881555,
      "kl": 0.307861328125,
      "learning_rate": 3.7830451327485367e-07,
      "loss": -0.007,
      "num_tokens": 16349026.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 742.03125,
      "completions/mean_terminated_length": 676.9615478515625,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 1.75,
      "grad_norm": 0.7896783411535454,
      "kl": 0.309326171875,
      "learning_rate": 3.778751835262847e-07,
      "loss": -0.0235,
      "num_tokens": 16378831.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 753.5,
      "completions/mean_terminated_length": 725.5172119140625,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 1.7530864197530864,
      "grad_norm": 0.4569321087531079,
      "kl": 0.32763671875,
      "learning_rate": 3.7744534240977085e-07,
      "loss": -0.0215,
      "num_tokens": 16409591.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 775.28125,
      "completions/mean_terminated_length": 717.8846435546875,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 1.7561728395061729,
      "grad_norm": 0.8922151507505256,
      "kl": 0.3408203125,
      "learning_rate": 3.7701499164423045e-07,
      "loss": 0.0143,
      "num_tokens": 16440616.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 830.0,
      "completions/mean_length": 761.5,
      "completions/mean_terminated_length": 658.7825927734375,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 1.7592592592592593,
      "grad_norm": 1.6746867773094058,
      "kl": 0.3203125,
      "learning_rate": 3.7658413295061974e-07,
      "loss": -0.0416,
      "num_tokens": 16471664.0,
      "reward": 0.0,
      "reward_std": 0.15466603636741638,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 823.9375,
      "completions/mean_terminated_length": 745.6521606445312,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 1.7623456790123457,
      "grad_norm": 1.330520862716196,
      "kl": 0.2933349609375,
      "learning_rate": 3.7615276805192595e-07,
      "loss": 0.0483,
      "num_tokens": 16504522.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.15880176424980164,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 747.09375,
      "completions/mean_terminated_length": 669.5599975585938,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 1.765432098765432,
      "grad_norm": 0.7477047350570117,
      "kl": 0.3272705078125,
      "learning_rate": 3.7572089867316075e-07,
      "loss": 0.0122,
      "num_tokens": 16534861.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 781.40625,
      "completions/mean_terminated_length": 700.5416870117188,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 1.7685185185185186,
      "grad_norm": 1.0866824842532468,
      "kl": 0.3203125,
      "learning_rate": 3.7528852654135323e-07,
      "loss": 0.0174,
      "num_tokens": 16566714.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.1024516224861145,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 715.1875,
      "completions/mean_terminated_length": 683.2413940429688,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 1.7716049382716048,
      "grad_norm": 1.0340351299850523,
      "kl": 0.345458984375,
      "learning_rate": 3.7485565338554294e-07,
      "loss": -0.023,
      "num_tokens": 16596320.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 816.0625,
      "completions/mean_terminated_length": 734.6956787109375,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 1.7746913580246915,
      "grad_norm": 0.6909734053595269,
      "kl": 0.352294921875,
      "learning_rate": 3.7442228093677296e-07,
      "loss": 0.0024,
      "num_tokens": 16628850.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 768.625,
      "completions/mean_terminated_length": 652.5454711914062,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 1.7777777777777777,
      "grad_norm": 0.7519408898271244,
      "kl": 0.372802734375,
      "learning_rate": 3.7398841092808307e-07,
      "loss": -0.0118,
      "num_tokens": 16660642.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 716.0,
      "completions/mean_terminated_length": 644.923095703125,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 1.7808641975308643,
      "grad_norm": 1.0225990435274304,
      "kl": 0.3367919921875,
      "learning_rate": 3.735540450945028e-07,
      "loss": 0.0025,
      "num_tokens": 16689590.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.07932206988334656,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 752.375,
      "completions/mean_terminated_length": 702.0740966796875,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 1.7839506172839505,
      "grad_norm": 0.49306929501114305,
      "kl": 0.333251953125,
      "learning_rate": 3.731191851730443e-07,
      "loss": -0.0014,
      "num_tokens": 16720414.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 805.4375,
      "completions/mean_terminated_length": 706.0909423828125,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 1.7870370370370372,
      "grad_norm": 0.9803899573700692,
      "kl": 0.295166015625,
      "learning_rate": 3.7268383290269583e-07,
      "loss": 0.0148,
      "num_tokens": 16753040.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.17674319446086884,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096889972687,
      "step": 579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 779.5625,
      "completions/mean_terminated_length": 683.9130859375,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 1.7901234567901234,
      "grad_norm": 1.1735776018585131,
      "kl": 0.317138671875,
      "learning_rate": 3.7224799002441427e-07,
      "loss": -0.0431,
      "num_tokens": 16784862.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 698.40625,
      "completions/mean_terminated_length": 638.1111450195312,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 1.7932098765432098,
      "grad_norm": 0.010817240832333085,
      "kl": 0.326171875,
      "learning_rate": 3.718116582811186e-07,
      "loss": 0.0003,
      "num_tokens": 16813619.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 776.8125,
      "completions/mean_terminated_length": 707.5999755859375,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 1.7962962962962963,
      "grad_norm": 0.5015604487434996,
      "kl": 0.3193359375,
      "learning_rate": 3.713748394176827e-07,
      "loss": 0.0167,
      "num_tokens": 16844805.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 810.40625,
      "completions/mean_terminated_length": 726.8261108398438,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 1.7993827160493827,
      "grad_norm": 0.00802844464316047,
      "kl": 0.3101806640625,
      "learning_rate": 3.7093753518092853e-07,
      "loss": 0.0003,
      "num_tokens": 16877326.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 747.90625,
      "completions/mean_terminated_length": 684.1923217773438,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 1.8024691358024691,
      "grad_norm": 1.0248104785700363,
      "kl": 0.3275146484375,
      "learning_rate": 3.704997473196187e-07,
      "loss": -0.0439,
      "num_tokens": 16907683.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 717.71875,
      "completions/mean_terminated_length": 647.0385131835938,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 1.8055555555555556,
      "grad_norm": 0.008714448696403034,
      "kl": 0.2947998046875,
      "learning_rate": 3.7006147758445017e-07,
      "loss": 0.0003,
      "num_tokens": 16936930.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 742.53125,
      "completions/mean_terminated_length": 632.3912963867188,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 1.808641975308642,
      "grad_norm": 2.231146106662793,
      "kl": 0.29833984375,
      "learning_rate": 3.696227277280467e-07,
      "loss": -0.0354,
      "num_tokens": 16967423.0,
      "reward": -3.259629011154175e-09,
      "reward_std": 0.17682674527168274,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 687.40625,
      "completions/mean_terminated_length": 676.54833984375,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 1.8117283950617284,
      "grad_norm": 1.4562902534305326,
      "kl": 0.28857421875,
      "learning_rate": 3.691834995049522e-07,
      "loss": -0.0859,
      "num_tokens": 16995588.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 812.75,
      "completions/mean_terminated_length": 702.0952758789062,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 1.8148148148148149,
      "grad_norm": 0.9753903401211428,
      "kl": 0.314697265625,
      "learning_rate": 3.687437946716234e-07,
      "loss": -0.017,
      "num_tokens": 17027744.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 800.625,
      "completions/mean_terminated_length": 666.6000366210938,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 1.817901234567901,
      "grad_norm": 0.8793951481635643,
      "kl": 0.30029296875,
      "learning_rate": 3.68303614986423e-07,
      "loss": -0.0106,
      "num_tokens": 17059916.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 764.6875,
      "completions/mean_terminated_length": 716.6666870117188,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "epoch": 1.8209876543209877,
      "grad_norm": 1.0182024225613862,
      "kl": 0.3121337890625,
      "learning_rate": 3.6786296220961277e-07,
      "loss": 0.0119,
      "num_tokens": 17090630.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 763.375,
      "completions/mean_terminated_length": 703.2307739257812,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 1.824074074074074,
      "grad_norm": 0.4918459088357609,
      "kl": 0.308837890625,
      "learning_rate": 3.6742183810334605e-07,
      "loss": 0.0205,
      "num_tokens": 17121198.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 782.09375,
      "completions/mean_terminated_length": 701.4583740234375,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 1.8271604938271606,
      "grad_norm": 0.541075467064785,
      "kl": 0.303466796875,
      "learning_rate": 3.6698024443166134e-07,
      "loss": -0.0013,
      "num_tokens": 17152845.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 793.1875,
      "completions/mean_terminated_length": 688.2727661132812,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 1.8302469135802468,
      "grad_norm": 1.224660735592709,
      "kl": 0.2767333984375,
      "learning_rate": 3.6653818296047466e-07,
      "loss": -0.0056,
      "num_tokens": 17184691.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 720.0,
      "completions/mean_terminated_length": 663.7037353515625,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 1.8333333333333335,
      "grad_norm": 0.9040897384601864,
      "kl": 0.322509765625,
      "learning_rate": 3.660956554575729e-07,
      "loss": -0.0367,
      "num_tokens": 17213943.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.04620163142681122,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 795.78125,
      "completions/mean_terminated_length": 706.478271484375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 1.8364197530864197,
      "grad_norm": 0.830769741431794,
      "kl": 0.343505859375,
      "learning_rate": 3.656526636926065e-07,
      "loss": 0.0037,
      "num_tokens": 17246256.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 738.75,
      "completions/mean_terminated_length": 698.0000610351562,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 1.8395061728395061,
      "grad_norm": 1.057906036541086,
      "kl": 0.30712890625,
      "learning_rate": 3.652092094370826e-07,
      "loss": -0.0391,
      "num_tokens": 17276232.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15134452283382416,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 714.8125,
      "completions/mean_terminated_length": 682.8275756835938,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 1.8425925925925926,
      "grad_norm": 3.485929854350874,
      "kl": 0.3009033203125,
      "learning_rate": 3.647652944643577e-07,
      "loss": -0.2627,
      "num_tokens": 17305438.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 799.71875,
      "completions/mean_terminated_length": 697.7727661132812,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 1.845679012345679,
      "grad_norm": 0.8559240818386872,
      "kl": 0.3125,
      "learning_rate": 3.6432092054963055e-07,
      "loss": -0.0229,
      "num_tokens": 17337949.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13241708278656006,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 823.9375,
      "completions/mean_terminated_length": 719.1428833007812,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 1.8487654320987654,
      "grad_norm": 0.009840729619351467,
      "kl": 0.279052734375,
      "learning_rate": 3.638760894699355e-07,
      "loss": 0.0003,
      "num_tokens": 17370767.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 735.65625,
      "completions/mean_terminated_length": 669.1154174804688,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 1.8518518518518519,
      "grad_norm": 0.7490011171282679,
      "kl": 0.2989501953125,
      "learning_rate": 3.6343080300413497e-07,
      "loss": 0.0305,
      "num_tokens": 17400700.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 742.125,
      "completions/mean_terminated_length": 689.9259033203125,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 1.8549382716049383,
      "grad_norm": 1.4493332077099517,
      "kl": 0.276611328125,
      "learning_rate": 3.629850629329124e-07,
      "loss": -0.0752,
      "num_tokens": 17431296.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 763.96875,
      "completions/mean_terminated_length": 703.9615478515625,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 1.8580246913580247,
      "grad_norm": 0.7347059578264971,
      "kl": 0.3212890625,
      "learning_rate": 3.625388710387651e-07,
      "loss": -0.012,
      "num_tokens": 17462131.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 823.3125,
      "completions/mean_terminated_length": 718.1904907226562,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 1.8611111111111112,
      "grad_norm": 1.257451334629759,
      "kl": 0.2890625,
      "learning_rate": 3.6209222910599746e-07,
      "loss": -0.0073,
      "num_tokens": 17495037.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.15673846006393433,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 743.46875,
      "completions/mean_terminated_length": 678.7307739257812,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 1.8641975308641974,
      "grad_norm": 0.5034951058170785,
      "kl": 0.2926025390625,
      "learning_rate": 3.616451389207133e-07,
      "loss": 0.0217,
      "num_tokens": 17525484.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 773.3125,
      "completions/mean_terminated_length": 715.4615478515625,
      "completions/min_length": 554.0,
      "completions/min_terminated_length": 554.0,
      "epoch": 1.867283950617284,
      "grad_norm": 0.7588473545628874,
      "kl": 0.2994384765625,
      "learning_rate": 3.611976022708091e-07,
      "loss": 0.0124,
      "num_tokens": 17556462.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 701.53125,
      "completions/mean_terminated_length": 627.1154174804688,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 1.8703703703703702,
      "grad_norm": 0.767726931403257,
      "kl": 0.293212890625,
      "learning_rate": 3.6074962094596676e-07,
      "loss": 0.0028,
      "num_tokens": 17584959.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 771.34375,
      "completions/mean_terminated_length": 687.125,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 1.873456790123457,
      "grad_norm": 0.9241261242502699,
      "kl": 0.2957763671875,
      "learning_rate": 3.603011967376464e-07,
      "loss": -0.0056,
      "num_tokens": 17616442.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 762.09375,
      "completions/mean_terminated_length": 674.7916870117188,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 1.876543209876543,
      "grad_norm": 0.622414233832134,
      "kl": 0.3138427734375,
      "learning_rate": 3.598523314390792e-07,
      "loss": 0.0083,
      "num_tokens": 17647905.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 697.84375,
      "completions/mean_terminated_length": 676.1000366210938,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 1.8796296296296298,
      "grad_norm": 1.1736021499772744,
      "kl": 0.314453125,
      "learning_rate": 3.594030268452601e-07,
      "loss": 0.0237,
      "num_tokens": 17676520.0,
      "reward": 0.0,
      "reward_std": 0.15075348317623138,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 753.90625,
      "completions/mean_terminated_length": 703.888916015625,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 1.882716049382716,
      "grad_norm": 0.45725316166440305,
      "kl": 0.290771484375,
      "learning_rate": 3.5895328475294106e-07,
      "loss": 0.0203,
      "num_tokens": 17707237.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 705.96875,
      "completions/mean_terminated_length": 673.0689697265625,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 1.8858024691358026,
      "grad_norm": 0.8632045276371411,
      "kl": 0.30810546875,
      "learning_rate": 3.585031069606234e-07,
      "loss": -0.0005,
      "num_tokens": 17736384.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 694.03125,
      "completions/mean_terminated_length": 659.8965454101562,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 1.8888888888888888,
      "grad_norm": 0.6344068055819815,
      "kl": 0.3033447265625,
      "learning_rate": 3.5805249526855074e-07,
      "loss": 0.0159,
      "num_tokens": 17764777.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 651.09375,
      "completions/mean_terminated_length": 639.0645141601562,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 1.8919753086419753,
      "grad_norm": 0.8866730404607887,
      "kl": 0.3212890625,
      "learning_rate": 3.5760145147870204e-07,
      "loss": -0.0119,
      "num_tokens": 17791632.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 760.3125,
      "completions/mean_terminated_length": 686.47998046875,
      "completions/min_length": 536.0,
      "completions/min_terminated_length": 536.0,
      "epoch": 1.8950617283950617,
      "grad_norm": 0.7948213609530317,
      "kl": 0.28515625,
      "learning_rate": 3.571499773947839e-07,
      "loss": -0.0038,
      "num_tokens": 17822462.0,
      "reward": 0.0,
      "reward_std": 0.15908902883529663,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 789.25,
      "completions/mean_terminated_length": 682.5454711914062,
      "completions/min_length": 525.0,
      "completions/min_terminated_length": 525.0,
      "epoch": 1.8981481481481481,
      "grad_norm": 0.564664960370857,
      "kl": 0.289794921875,
      "learning_rate": 3.5669807482222395e-07,
      "loss": -0.012,
      "num_tokens": 17854338.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 763.125,
      "completions/mean_terminated_length": 702.923095703125,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 1.9012345679012346,
      "grad_norm": 0.5785058641997257,
      "kl": 0.2777099609375,
      "learning_rate": 3.562457455681633e-07,
      "loss": 0.0045,
      "num_tokens": 17885690.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 659.125,
      "completions/mean_terminated_length": 634.800048828125,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 1.904320987654321,
      "grad_norm": 0.529554309890252,
      "kl": 0.284912109375,
      "learning_rate": 3.557929914414491e-07,
      "loss": 0.0019,
      "num_tokens": 17912774.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 720.34375,
      "completions/mean_terminated_length": 676.9642944335938,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 1.9074074074074074,
      "grad_norm": 1.36796285909082,
      "kl": 0.330810546875,
      "learning_rate": 3.553398142526277e-07,
      "loss": -0.0637,
      "num_tokens": 17941565.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.21052932739257812,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 776.40625,
      "completions/mean_terminated_length": 663.8636474609375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 1.9104938271604939,
      "grad_norm": 1.223456191622778,
      "kl": 0.2587890625,
      "learning_rate": 3.5488621581393736e-07,
      "loss": 0.019,
      "num_tokens": 17972954.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 738.90625,
      "completions/mean_terminated_length": 673.1154174804688,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 1.9135802469135803,
      "grad_norm": 0.9181830067181223,
      "kl": 0.4039306640625,
      "learning_rate": 3.5443219793930073e-07,
      "loss": 0.0137,
      "num_tokens": 18003263.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 859.6875,
      "completions/mean_terminated_length": 731.888916015625,
      "completions/min_length": 498.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 1.9166666666666665,
      "grad_norm": 1.162305257769272,
      "kl": 0.2412109375,
      "learning_rate": 3.5397776244431794e-07,
      "loss": 0.033,
      "num_tokens": 18037797.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.2121918946504593,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -6.51925802230835e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 771.8125,
      "completions/mean_terminated_length": 713.6154174804688,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 1.9197530864197532,
      "grad_norm": 1.0754812877536444,
      "kl": 0.293212890625,
      "learning_rate": 3.535229111462589e-07,
      "loss": 0.0009,
      "num_tokens": 18069319.0,
      "reward": 0.0,
      "reward_std": 0.18565019965171814,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 795.53125,
      "completions/mean_terminated_length": 719.375,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 1.9228395061728394,
      "grad_norm": 1.418906680488901,
      "kl": 0.246337890625,
      "learning_rate": 3.530676458640567e-07,
      "loss": 0.079,
      "num_tokens": 18101908.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 764.96875,
      "completions/mean_terminated_length": 692.4400024414062,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 1.925925925925926,
      "grad_norm": 0.6694783362066277,
      "kl": 0.2960205078125,
      "learning_rate": 3.5261196841829957e-07,
      "loss": 0.0058,
      "num_tokens": 18132931.0,
      "reward": 0.0,
      "reward_std": 0.12547743320465088,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 744.375,
      "completions/mean_terminated_length": 679.84619140625,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 1.9290123456790123,
      "grad_norm": 1.067940047943812,
      "kl": 0.31640625,
      "learning_rate": 3.521558806312241e-07,
      "loss": -0.0168,
      "num_tokens": 18162823.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 756.375,
      "completions/mean_terminated_length": 706.8148193359375,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 1.932098765432099,
      "grad_norm": 0.7395763172607944,
      "kl": 0.279052734375,
      "learning_rate": 3.5169938432670775e-07,
      "loss": 0.0014,
      "num_tokens": 18193815.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 657.1875,
      "completions/mean_terminated_length": 645.3547973632812,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 1.9351851851851851,
      "grad_norm": 0.7412616344854341,
      "kl": 0.2867431640625,
      "learning_rate": 3.5124248133026187e-07,
      "loss": 0.0426,
      "num_tokens": 18221013.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 736.21875,
      "completions/mean_terminated_length": 695.107177734375,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 1.9382716049382716,
      "grad_norm": 1.3118440074292663,
      "kl": 0.2803955078125,
      "learning_rate": 3.5078517346902384e-07,
      "loss": -0.0234,
      "num_tokens": 18251540.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.22114333510398865,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 803.125,
      "completions/mean_terminated_length": 687.4285888671875,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 1.941358024691358,
      "grad_norm": 0.824049243789038,
      "kl": 0.2554931640625,
      "learning_rate": 3.503274625717504e-07,
      "loss": 0.0075,
      "num_tokens": 18283848.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 676.9375,
      "completions/mean_terminated_length": 627.357177734375,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 1.9444444444444444,
      "grad_norm": 0.8005612327423927,
      "kl": 0.309814453125,
      "learning_rate": 3.498693504688097e-07,
      "loss": 0.04,
      "num_tokens": 18311882.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 723.8125,
      "completions/mean_terminated_length": 654.5385131835938,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 1.9475308641975309,
      "grad_norm": 0.968269644106719,
      "kl": 0.326416015625,
      "learning_rate": 3.494108389921744e-07,
      "loss": -0.0321,
      "num_tokens": 18341316.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 793.46875,
      "completions/mean_terminated_length": 703.2608642578125,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 1.9506172839506173,
      "grad_norm": 1.227757361012959,
      "kl": 0.2994384765625,
      "learning_rate": 3.4895192997541436e-07,
      "loss": 0.0301,
      "num_tokens": 18373583.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.19315354526042938,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 709.09375,
      "completions/mean_terminated_length": 688.1000366210938,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 1.9537037037037037,
      "grad_norm": 1.0844100134163452,
      "kl": 0.278564453125,
      "learning_rate": 3.484926252536891e-07,
      "loss": 0.0508,
      "num_tokens": 18402806.0,
      "reward": 0.0,
      "reward_std": 0.1586425006389618,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 732.71875,
      "completions/mean_terminated_length": 665.5,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 1.9567901234567902,
      "grad_norm": 0.011084886665632733,
      "kl": 0.2913818359375,
      "learning_rate": 3.4803292666374047e-07,
      "loss": 0.0003,
      "num_tokens": 18432697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 622.65625,
      "completions/mean_terminated_length": 595.9000244140625,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 1.9598765432098766,
      "grad_norm": 1.6160733763497246,
      "kl": 0.3094482421875,
      "learning_rate": 3.4757283604388546e-07,
      "loss": -0.0094,
      "num_tokens": 18458722.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.18118225038051605,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 734.40625,
      "completions/mean_terminated_length": 637.875,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 1.9629629629629628,
      "grad_norm": 1.7914416516037617,
      "kl": 0.274658203125,
      "learning_rate": 3.47112355234009e-07,
      "loss": 0.0618,
      "num_tokens": 18488591.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.2114925980567932,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 762.71875,
      "completions/mean_terminated_length": 714.3333129882812,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 1.9660493827160495,
      "grad_norm": 1.0571098256792673,
      "kl": 0.248291015625,
      "learning_rate": 3.466514860755559e-07,
      "loss": 0.0368,
      "num_tokens": 18519174.0,
      "reward": 1.3969838619232178e-09,
      "reward_std": 0.14951469004154205,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 640.75,
      "completions/mean_terminated_length": 586.0,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 1.9691358024691357,
      "grad_norm": 1.0766277591693325,
      "kl": 0.28857421875,
      "learning_rate": 3.4619023041152433e-07,
      "loss": 0.028,
      "num_tokens": 18545930.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18108326196670532,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 743.375,
      "completions/mean_terminated_length": 649.8333740234375,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 1.9722222222222223,
      "grad_norm": 0.6608561339989446,
      "kl": 0.2994384765625,
      "learning_rate": 3.4572859008645796e-07,
      "loss": 0.0008,
      "num_tokens": 18576034.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 770.6875,
      "completions/mean_terminated_length": 712.2307739257812,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.9753086419753085,
      "grad_norm": 1.3288545908857277,
      "kl": 0.257568359375,
      "learning_rate": 3.452665669464386e-07,
      "loss": 0.0173,
      "num_tokens": 18607152.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.20140354335308075,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 749.34375,
      "completions/mean_terminated_length": 657.7916870117188,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 1.9783950617283952,
      "grad_norm": 0.9487511446974353,
      "kl": 0.3446044921875,
      "learning_rate": 3.448041628390791e-07,
      "loss": -0.0056,
      "num_tokens": 18638011.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 682.15625,
      "completions/mean_terminated_length": 633.3214721679688,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 1.9814814814814814,
      "grad_norm": 1.1936306191580994,
      "kl": 0.3072509765625,
      "learning_rate": 3.443413796135159e-07,
      "loss": 0.0044,
      "num_tokens": 18666196.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 721.9375,
      "completions/mean_terminated_length": 621.25,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 1.984567901234568,
      "grad_norm": 0.024297146526663496,
      "kl": 0.293212890625,
      "learning_rate": 3.4387821912040116e-07,
      "loss": 0.0003,
      "num_tokens": 18695778.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 801.34375,
      "completions/mean_terminated_length": 700.1364135742188,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 1.9876543209876543,
      "grad_norm": 0.8239805671743939,
      "kl": 0.2957763671875,
      "learning_rate": 3.4341468321189574e-07,
      "loss": -0.013,
      "num_tokens": 18728109.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 789.8125,
      "completions/mean_terminated_length": 667.1428833007812,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 1.9907407407407407,
      "grad_norm": 1.135707835543851,
      "kl": 0.32373046875,
      "learning_rate": 3.4295077374166214e-07,
      "loss": -0.0133,
      "num_tokens": 18760591.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 667.0625,
      "completions/mean_terminated_length": 630.137939453125,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 1.9938271604938271,
      "grad_norm": 0.9644947395422476,
      "kl": 0.3138427734375,
      "learning_rate": 3.4248649256485655e-07,
      "loss": -0.0243,
      "num_tokens": 18788005.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 728.34375,
      "completions/mean_terminated_length": 660.1154174804688,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 1.9969135802469136,
      "grad_norm": 0.8541538616070171,
      "kl": 0.30078125,
      "learning_rate": 3.4202184153812135e-07,
      "loss": -0.0279,
      "num_tokens": 18817956.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 752.0,
      "completions/mean_terminated_length": 675.8399658203125,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 2.0,
      "grad_norm": 1.109871309325268,
      "kl": 0.3179931640625,
      "learning_rate": 3.415568225195783e-07,
      "loss": -0.0034,
      "num_tokens": 18848804.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.18632806837558746,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 739.25,
      "completions/mean_terminated_length": 609.8181762695312,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 2.003086419753086,
      "grad_norm": 1.4385743455681783,
      "kl": 0.2890625,
      "learning_rate": 3.410914373688205e-07,
      "loss": -0.0302,
      "num_tokens": 18879168.0,
      "reward": 0.0,
      "reward_std": 0.23120412230491638,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 697.21875,
      "completions/mean_terminated_length": 650.5357666015625,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 2.006172839506173,
      "grad_norm": 1.5648401892985397,
      "kl": 0.280029296875,
      "learning_rate": 3.4062568794690536e-07,
      "loss": -0.0508,
      "num_tokens": 18907779.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.2660294771194458,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 741.09375,
      "completions/mean_terminated_length": 675.8077392578125,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 2.009259259259259,
      "grad_norm": 0.9815139820291894,
      "kl": 0.295654296875,
      "learning_rate": 3.401595761163468e-07,
      "loss": -0.0218,
      "num_tokens": 18938862.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 699.78125,
      "completions/mean_terminated_length": 639.74072265625,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 2.0123456790123457,
      "grad_norm": 0.9115095196582362,
      "kl": 0.3330078125,
      "learning_rate": 3.3969310374110817e-07,
      "loss": -0.0196,
      "num_tokens": 18967635.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 739.46875,
      "completions/mean_terminated_length": 698.8214721679688,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 2.015432098765432,
      "grad_norm": 1.2778458791627896,
      "kl": 0.2794189453125,
      "learning_rate": 3.3922627268659467e-07,
      "loss": -0.0159,
      "num_tokens": 18997986.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.255505234003067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 725.65625,
      "completions/mean_terminated_length": 626.2083740234375,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 2.0185185185185186,
      "grad_norm": 0.8902677301596652,
      "kl": 0.308837890625,
      "learning_rate": 3.387590848196456e-07,
      "loss": -0.0005,
      "num_tokens": 19027647.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 709.4375,
      "completions/mean_terminated_length": 676.8965454101562,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 2.021604938271605,
      "grad_norm": 1.1985951033383033,
      "kl": 0.29248046875,
      "learning_rate": 3.382915420085274e-07,
      "loss": 0.0065,
      "num_tokens": 19056717.0,
      "reward": 0.0,
      "reward_std": 0.12963305413722992,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 667.71875,
      "completions/mean_terminated_length": 630.862060546875,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 2.0246913580246915,
      "grad_norm": 1.6815841099967368,
      "kl": 0.2943115234375,
      "learning_rate": 3.3782364612292574e-07,
      "loss": 0.0686,
      "num_tokens": 19084496.0,
      "reward": 0.0,
      "reward_std": 0.26032719016075134,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 697.21875,
      "completions/mean_terminated_length": 621.8077392578125,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 2.0277777777777777,
      "grad_norm": 1.395411956711552,
      "kl": 0.3048095703125,
      "learning_rate": 3.3735539903393826e-07,
      "loss": -0.0058,
      "num_tokens": 19113567.0,
      "reward": 1.3969838619232178e-09,
      "reward_std": 0.14742480218410492,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 698.71875,
      "completions/mean_terminated_length": 638.4815063476562,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 2.0308641975308643,
      "grad_norm": 0.9317063020509377,
      "kl": 0.337646484375,
      "learning_rate": 3.368868026140672e-07,
      "loss": -0.0169,
      "num_tokens": 19142366.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 876.09375,
      "completions/mean_terminated_length": 728.1875,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 2.0339506172839505,
      "grad_norm": 1.2876846021592718,
      "kl": 0.2957763671875,
      "learning_rate": 3.364178587372115e-07,
      "loss": 0.0163,
      "num_tokens": 19177329.0,
      "reward": 0.0,
      "reward_std": 0.13859109580516815,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 737.0,
      "completions/mean_terminated_length": 670.7692260742188,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 2.037037037037037,
      "grad_norm": 1.061515415652727,
      "kl": 0.30517578125,
      "learning_rate": 3.359485692786597e-07,
      "loss": 0.0055,
      "num_tokens": 19207473.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 651.46875,
      "completions/mean_terminated_length": 612.9310302734375,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 2.0401234567901234,
      "grad_norm": 1.2138284059011546,
      "kl": 0.2908935546875,
      "learning_rate": 3.354789361150824e-07,
      "loss": -0.0062,
      "num_tokens": 19234564.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18632371723651886,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 781.28125,
      "completions/mean_terminated_length": 700.375,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 2.04320987654321,
      "grad_norm": 0.963881958120758,
      "kl": 0.2991943359375,
      "learning_rate": 3.350089611245246e-07,
      "loss": -0.012,
      "num_tokens": 19266345.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 664.53125,
      "completions/mean_terminated_length": 597.9629516601562,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 2.0462962962962963,
      "grad_norm": 1.5609560581092616,
      "kl": 0.31982421875,
      "learning_rate": 3.345386461863981e-07,
      "loss": 0.0188,
      "num_tokens": 19293898.0,
      "reward": 0.0,
      "reward_std": 0.19320368766784668,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 728.53125,
      "completions/mean_terminated_length": 645.7999877929688,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 2.049382716049383,
      "grad_norm": 1.3705062779467612,
      "kl": 0.2974853515625,
      "learning_rate": 3.340679931814743e-07,
      "loss": -0.0309,
      "num_tokens": 19324011.0,
      "reward": 0.0,
      "reward_std": 0.19272641837596893,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 758.53125,
      "completions/mean_terminated_length": 637.8636474609375,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 2.052469135802469,
      "grad_norm": 1.3954602989287956,
      "kl": 0.3125,
      "learning_rate": 3.3359700399187654e-07,
      "loss": -0.0351,
      "num_tokens": 19354904.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1973274052143097,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 719.0,
      "completions/mean_terminated_length": 633.5999755859375,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 2.0555555555555554,
      "grad_norm": 1.1553022962099326,
      "kl": 0.3074951171875,
      "learning_rate": 3.331256805010724e-07,
      "loss": 0.0275,
      "num_tokens": 19385080.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1587591916322708,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 693.59375,
      "completions/mean_terminated_length": 632.4074096679688,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 2.058641975308642,
      "grad_norm": 1.1050126577074906,
      "kl": 0.321044921875,
      "learning_rate": 3.326540245938666e-07,
      "loss": -0.009,
      "num_tokens": 19414171.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.16144341230392456,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 690.34375,
      "completions/mean_terminated_length": 628.5555419921875,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 2.0617283950617282,
      "grad_norm": 1.041452430840546,
      "kl": 0.322998046875,
      "learning_rate": 3.3218203815639265e-07,
      "loss": -0.0023,
      "num_tokens": 19442558.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.17738276720046997,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 940.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 635.28125,
      "completions/mean_terminated_length": 635.28125,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 2.064814814814815,
      "grad_norm": 0.9320130061694388,
      "kl": 0.33154296875,
      "learning_rate": 3.3170972307610654e-07,
      "loss": 0.0198,
      "num_tokens": 19470015.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 711.59375,
      "completions/mean_terminated_length": 666.9642944335938,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 2.067901234567901,
      "grad_norm": 1.0588532282410164,
      "kl": 0.3084716796875,
      "learning_rate": 3.312370812417779e-07,
      "loss": 0.0039,
      "num_tokens": 19499458.0,
      "reward": 0.0,
      "reward_std": 0.1587507575750351,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 732.6875,
      "completions/mean_terminated_length": 635.5833740234375,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 2.0709876543209877,
      "grad_norm": 0.6422280605695163,
      "kl": 0.32763671875,
      "learning_rate": 3.3076411454348336e-07,
      "loss": 0.0081,
      "num_tokens": 19529576.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 813.03125,
      "completions/mean_terminated_length": 686.4500122070312,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.074074074074074,
      "grad_norm": 0.5963639837233681,
      "kl": 0.2818603515625,
      "learning_rate": 3.3029082487259847e-07,
      "loss": -0.0212,
      "num_tokens": 19562377.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 733.28125,
      "completions/mean_terminated_length": 636.375,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 2.0771604938271606,
      "grad_norm": 0.9096796652560489,
      "kl": 0.335693359375,
      "learning_rate": 3.298172141217905e-07,
      "loss": -0.0103,
      "num_tokens": 19592410.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.13241708278656006,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 703.21875,
      "completions/mean_terminated_length": 643.8148193359375,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 2.080246913580247,
      "grad_norm": 0.9119271580757892,
      "kl": 0.30224609375,
      "learning_rate": 3.2934328418501064e-07,
      "loss": -0.0006,
      "num_tokens": 19621717.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.13241708278656006,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 873.0,
      "completions/mean_length": 719.28125,
      "completions/mean_terminated_length": 648.9615478515625,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 2.0833333333333335,
      "grad_norm": 0.9824268305954632,
      "kl": 0.322998046875,
      "learning_rate": 3.2886903695748647e-07,
      "loss": -0.0165,
      "num_tokens": 19651458.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 707.96875,
      "completions/mean_terminated_length": 662.8214721679688,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 2.0864197530864197,
      "grad_norm": 0.012430481591240248,
      "kl": 0.326904296875,
      "learning_rate": 3.2839447433571454e-07,
      "loss": 0.0003,
      "num_tokens": 19680221.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 680.46875,
      "completions/mean_terminated_length": 631.3928833007812,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 2.0895061728395063,
      "grad_norm": 1.000012060936064,
      "kl": 0.3388671875,
      "learning_rate": 3.279195982174524e-07,
      "loss": -0.0119,
      "num_tokens": 19708100.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 828.0,
      "completions/mean_length": 650.78125,
      "completions/mean_terminated_length": 597.4642944335938,
      "completions/min_length": 260.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 2.0925925925925926,
      "grad_norm": 0.9945218864494757,
      "kl": 0.326416015625,
      "learning_rate": 3.2744441050171136e-07,
      "loss": 0.0249,
      "num_tokens": 19735305.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 778.6875,
      "completions/mean_terminated_length": 696.9166870117188,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.095679012345679,
      "grad_norm": 1.8222405721065627,
      "kl": 0.31787109375,
      "learning_rate": 3.26968913088749e-07,
      "loss": 0.0398,
      "num_tokens": 19766883.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.14781978726387024,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 679.46875,
      "completions/mean_terminated_length": 599.9615478515625,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 2.0987654320987654,
      "grad_norm": 0.00959192813711383,
      "kl": 0.32861328125,
      "learning_rate": 3.264931078800611e-07,
      "loss": 0.0003,
      "num_tokens": 19795778.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 699.90625,
      "completions/mean_terminated_length": 653.607177734375,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 2.1018518518518516,
      "grad_norm": 0.9485800842585024,
      "kl": 0.30859375,
      "learning_rate": 3.260169967783744e-07,
      "loss": 0.0224,
      "num_tokens": 19824251.0,
      "reward": 0.0,
      "reward_std": 0.1271844357252121,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 653.59375,
      "completions/mean_terminated_length": 568.1154174804688,
      "completions/min_length": 325.0,
      "completions/min_terminated_length": 325.0,
      "epoch": 2.1049382716049383,
      "grad_norm": 0.009865302595620653,
      "kl": 0.337646484375,
      "learning_rate": 3.255405816876389e-07,
      "loss": 0.0003,
      "num_tokens": 19851382.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 725.65625,
      "completions/mean_terminated_length": 626.2083740234375,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 2.1080246913580245,
      "grad_norm": 0.5532169517801894,
      "kl": 0.333740234375,
      "learning_rate": 3.250638645130204e-07,
      "loss": -0.0093,
      "num_tokens": 19880827.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 735.15625,
      "completions/mean_terminated_length": 654.2799682617188,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 2.111111111111111,
      "grad_norm": 1.0527534126942464,
      "kl": 0.2872314453125,
      "learning_rate": 3.2458684716089224e-07,
      "loss": 0.0084,
      "num_tokens": 19910764.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 729.875,
      "completions/mean_terminated_length": 662.0,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 2.1141975308641974,
      "grad_norm": 0.8607550987770649,
      "kl": 0.3092041015625,
      "learning_rate": 3.241095315388287e-07,
      "loss": 0.0034,
      "num_tokens": 19940504.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 626.75,
      "completions/mean_terminated_length": 570.0,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 2.117283950617284,
      "grad_norm": 0.8887261315001356,
      "kl": 0.372314453125,
      "learning_rate": 3.2363191955559656e-07,
      "loss": 0.0126,
      "num_tokens": 19966924.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 671.0625,
      "completions/mean_terminated_length": 634.5516967773438,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 2.1203703703703702,
      "grad_norm": 0.8792114198448667,
      "kl": 0.30126953125,
      "learning_rate": 3.231540131211478e-07,
      "loss": 0.0014,
      "num_tokens": 19994406.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 719.96875,
      "completions/mean_terminated_length": 618.625,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 2.123456790123457,
      "grad_norm": 1.4858621699695647,
      "kl": 0.3321533203125,
      "learning_rate": 3.22675814146612e-07,
      "loss": 0.0211,
      "num_tokens": 20024065.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19692783057689667,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 655.5625,
      "completions/mean_terminated_length": 602.9285888671875,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 2.126543209876543,
      "grad_norm": 1.157245962016017,
      "kl": 0.34033203125,
      "learning_rate": 3.221973245442883e-07,
      "loss": -0.0425,
      "num_tokens": 20051483.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 676.0,
      "completions/mean_terminated_length": 640.0,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.1296296296296298,
      "grad_norm": 0.9085926062898118,
      "kl": 0.35595703125,
      "learning_rate": 3.217185462276382e-07,
      "loss": 0.003,
      "num_tokens": 20079575.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 708.03125,
      "completions/mean_terminated_length": 662.8928833007812,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 2.132716049382716,
      "grad_norm": 1.0348106919231934,
      "kl": 0.29296875,
      "learning_rate": 3.2123948111127795e-07,
      "loss": 0.0375,
      "num_tokens": 20108488.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 647.9375,
      "completions/mean_terminated_length": 609.0344848632812,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 2.1358024691358026,
      "grad_norm": 1.051940699012955,
      "kl": 0.33154296875,
      "learning_rate": 3.2076013111097055e-07,
      "loss": -0.0279,
      "num_tokens": 20135570.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1258779913187027,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 868.0,
      "completions/mean_length": 703.1875,
      "completions/mean_terminated_length": 629.1538696289062,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 2.138888888888889,
      "grad_norm": 1.194276905036908,
      "kl": 0.32373046875,
      "learning_rate": 3.20280498143618e-07,
      "loss": -0.0139,
      "num_tokens": 20164616.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.06432675570249557,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 808.0,
      "completions/mean_length": 672.875,
      "completions/mean_terminated_length": 607.8518676757812,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 2.1419753086419755,
      "grad_norm": 1.57027050884244,
      "kl": 0.324951171875,
      "learning_rate": 3.1980058412725436e-07,
      "loss": -0.0408,
      "num_tokens": 20192964.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.14020457863807678,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 688.40625,
      "completions/mean_terminated_length": 626.25927734375,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 2.1450617283950617,
      "grad_norm": 0.9778800856256499,
      "kl": 0.33740234375,
      "learning_rate": 3.1932039098103723e-07,
      "loss": 0.0111,
      "num_tokens": 20220881.0,
      "reward": 0.0,
      "reward_std": 0.12601491808891296,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 648.71875,
      "completions/mean_terminated_length": 609.8965454101562,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 2.148148148148148,
      "grad_norm": 1.0504986019750875,
      "kl": 0.332275390625,
      "learning_rate": 3.188399206252406e-07,
      "loss": -0.0081,
      "num_tokens": 20247852.0,
      "reward": -3.259629011154175e-09,
      "reward_std": 0.13805748522281647,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 726.8125,
      "completions/mean_terminated_length": 658.2307739257812,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 2.1512345679012346,
      "grad_norm": 0.9309988346346298,
      "kl": 0.33056640625,
      "learning_rate": 3.183591749812468e-07,
      "loss": 0.0146,
      "num_tokens": 20277322.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.16529497504234314,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 715.625,
      "completions/mean_terminated_length": 683.72412109375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 2.154320987654321,
      "grad_norm": 0.9657662092438589,
      "kl": 0.3033447265625,
      "learning_rate": 3.1787815597153934e-07,
      "loss": 0.0175,
      "num_tokens": 20306886.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 837.0,
      "completions/mean_length": 752.28125,
      "completions/mean_terminated_length": 628.7727661132812,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 2.1574074074074074,
      "grad_norm": 1.0651472313027317,
      "kl": 0.317138671875,
      "learning_rate": 3.173968655196947e-07,
      "loss": 0.02,
      "num_tokens": 20338207.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 729.0625,
      "completions/mean_terminated_length": 686.9285888671875,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 2.1604938271604937,
      "grad_norm": 0.9675557243805978,
      "kl": 0.31494140625,
      "learning_rate": 3.1691530555037493e-07,
      "loss": 0.0208,
      "num_tokens": 20368389.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 737.5,
      "completions/mean_terminated_length": 607.2727661132812,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 2.1635802469135803,
      "grad_norm": 0.010486660684088683,
      "kl": 0.350830078125,
      "learning_rate": 3.164334779893198e-07,
      "loss": 0.0004,
      "num_tokens": 20398497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 647.9375,
      "completions/mean_terminated_length": 594.2142944335938,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 2.1666666666666665,
      "grad_norm": 1.0665304434407505,
      "kl": 0.33349609375,
      "learning_rate": 3.159513847633393e-07,
      "loss": 0.0416,
      "num_tokens": 20425915.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 697.875,
      "completions/mean_terminated_length": 651.2857666015625,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 2.169753086419753,
      "grad_norm": 1.2849560268418567,
      "kl": 0.31201171875,
      "learning_rate": 3.1546902780030555e-07,
      "loss": -0.0599,
      "num_tokens": 20455459.0,
      "reward": 0.0,
      "reward_std": 0.14231424033641815,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 656.9375,
      "completions/mean_terminated_length": 604.5,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 2.1728395061728394,
      "grad_norm": 0.881842731943163,
      "kl": 0.343017578125,
      "learning_rate": 3.1498640902914565e-07,
      "loss": 0.0025,
      "num_tokens": 20482961.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 703.375,
      "completions/mean_terminated_length": 644.0,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 2.175925925925926,
      "grad_norm": 0.023534372819004366,
      "kl": 0.369140625,
      "learning_rate": 3.1450353037983346e-07,
      "loss": 0.0004,
      "num_tokens": 20511785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 682.9375,
      "completions/mean_terminated_length": 587.4400024414062,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 2.1790123456790123,
      "grad_norm": 1.4509484141446072,
      "kl": 0.32421875,
      "learning_rate": 3.140203937833821e-07,
      "loss": 0.033,
      "num_tokens": 20539951.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.25439155101776123,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 697.03125,
      "completions/mean_terminated_length": 605.47998046875,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 2.182098765432099,
      "grad_norm": 1.5252535390561108,
      "kl": 0.3302001953125,
      "learning_rate": 3.135370011718364e-07,
      "loss": -0.0077,
      "num_tokens": 20569004.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.11586824059486389,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 633.375,
      "completions/mean_terminated_length": 592.9655151367188,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 2.185185185185185,
      "grad_norm": 1.5863713496813534,
      "kl": 0.347900390625,
      "learning_rate": 3.1305335447826477e-07,
      "loss": -0.0169,
      "num_tokens": 20595440.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2432374656200409,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 687.3125,
      "completions/mean_terminated_length": 593.0399780273438,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 2.1882716049382718,
      "grad_norm": 0.6675990384614131,
      "kl": 0.3150634765625,
      "learning_rate": 3.125694556367517e-07,
      "loss": 0.0123,
      "num_tokens": 20624118.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 663.71875,
      "completions/mean_terminated_length": 612.25,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 2.191358024691358,
      "grad_norm": 0.9065284425566196,
      "kl": 0.354736328125,
      "learning_rate": 3.1208530658239e-07,
      "loss": 0.0227,
      "num_tokens": 20651893.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 702.875,
      "completions/mean_terminated_length": 628.7692260742188,
      "completions/min_length": 317.0,
      "completions/min_terminated_length": 317.0,
      "epoch": 2.1944444444444446,
      "grad_norm": 0.5921535166475328,
      "kl": 0.3311767578125,
      "learning_rate": 3.1160090925127325e-07,
      "loss": 0.0264,
      "num_tokens": 20680765.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 721.9375,
      "completions/mean_terminated_length": 666.0,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 2.197530864197531,
      "grad_norm": 0.016570738190573833,
      "kl": 0.3167724609375,
      "learning_rate": 3.1111626558048777e-07,
      "loss": 0.0003,
      "num_tokens": 20710403.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 706.25,
      "completions/mean_terminated_length": 647.4074096679688,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 2.200617283950617,
      "grad_norm": 0.7071664428786564,
      "kl": 0.33056640625,
      "learning_rate": 3.1063137750810493e-07,
      "loss": -0.0117,
      "num_tokens": 20739203.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 761.625,
      "completions/mean_terminated_length": 674.1666870117188,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 2.2037037037037037,
      "grad_norm": 0.7667457104932189,
      "kl": 0.3172607421875,
      "learning_rate": 3.101462469731735e-07,
      "loss": -0.0108,
      "num_tokens": 20770355.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 700.21875,
      "completions/mean_terminated_length": 666.72412109375,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 2.20679012345679,
      "grad_norm": 0.9688229532526872,
      "kl": 0.2862548828125,
      "learning_rate": 3.0966087591571184e-07,
      "loss": -0.0277,
      "num_tokens": 20799314.0,
      "reward": 0.0,
      "reward_std": 0.1463371366262436,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 710.71875,
      "completions/mean_terminated_length": 623.0,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 2.2098765432098766,
      "grad_norm": 0.13863287573914862,
      "kl": 0.3748779296875,
      "learning_rate": 3.091752662767001e-07,
      "loss": 0.0004,
      "num_tokens": 20828441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 733.5625,
      "completions/mean_terminated_length": 652.239990234375,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 2.212962962962963,
      "grad_norm": 1.1987272565380604,
      "kl": 0.299072265625,
      "learning_rate": 3.0868941999807274e-07,
      "loss": 0.0564,
      "num_tokens": 20858479.0,
      "reward": 0.0,
      "reward_std": 0.17122279107570648,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 726.8125,
      "completions/mean_terminated_length": 671.7777709960938,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 2.2160493827160495,
      "grad_norm": 0.008574960214126948,
      "kl": 0.3067626953125,
      "learning_rate": 3.082033390227102e-07,
      "loss": 0.0003,
      "num_tokens": 20888401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 741.09375,
      "completions/mean_terminated_length": 661.8800048828125,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "epoch": 2.2191358024691357,
      "grad_norm": 0.5220758773227693,
      "kl": 0.3082275390625,
      "learning_rate": 3.0771702529443163e-07,
      "loss": 0.0192,
      "num_tokens": 20919376.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 699.90625,
      "completions/mean_terminated_length": 653.607177734375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.2222222222222223,
      "grad_norm": 1.2888684625575575,
      "kl": 0.3294677734375,
      "learning_rate": 3.0723048075798694e-07,
      "loss": -0.0091,
      "num_tokens": 20948109.0,
      "reward": 0.0,
      "reward_std": 0.2004445195198059,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 758.25,
      "completions/mean_terminated_length": 669.6666870117188,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 2.2253086419753085,
      "grad_norm": 0.5431356344557754,
      "kl": 0.3170166015625,
      "learning_rate": 3.0674370735904917e-07,
      "loss": 0.0191,
      "num_tokens": 20978833.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 841.0,
      "completions/mean_length": 660.625,
      "completions/mean_terminated_length": 576.7692260742188,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 2.228395061728395,
      "grad_norm": 0.7805457357513953,
      "kl": 0.322265625,
      "learning_rate": 3.0625670704420634e-07,
      "loss": 0.0071,
      "num_tokens": 21006805.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 784.875,
      "completions/mean_terminated_length": 676.1818237304688,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 2.2314814814814814,
      "grad_norm": 0.8545715771313919,
      "kl": 0.300537109375,
      "learning_rate": 3.057694817609539e-07,
      "loss": -0.0007,
      "num_tokens": 21038533.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.11199356615543365,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 873.0,
      "completions/mean_length": 682.34375,
      "completions/mean_terminated_length": 633.5357666015625,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.234567901234568,
      "grad_norm": 0.6239766259790861,
      "kl": 0.359375,
      "learning_rate": 3.0528203345768717e-07,
      "loss": 0.0124,
      "num_tokens": 21066944.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 756.6875,
      "completions/mean_terminated_length": 681.8399658203125,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 2.2376543209876543,
      "grad_norm": 1.1670403229338628,
      "kl": 0.294189453125,
      "learning_rate": 3.047943640836931e-07,
      "loss": -0.0422,
      "num_tokens": 21097602.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 697.03125,
      "completions/mean_terminated_length": 650.3214721679688,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 2.240740740740741,
      "grad_norm": 1.6386161867048643,
      "kl": 0.339111328125,
      "learning_rate": 3.0430647558914284e-07,
      "loss": 0.0218,
      "num_tokens": 21126391.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19169270992279053,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 675.0625,
      "completions/mean_terminated_length": 663.8064575195312,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 2.243827160493827,
      "grad_norm": 0.7084404263887658,
      "kl": 0.3253173828125,
      "learning_rate": 3.038183699250837e-07,
      "loss": -0.0072,
      "num_tokens": 21154069.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 741.1875,
      "completions/mean_terminated_length": 662.0,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 2.246913580246914,
      "grad_norm": 0.5896845115900525,
      "kl": 0.299072265625,
      "learning_rate": 3.0333004904343153e-07,
      "loss": 0.0241,
      "num_tokens": 21184135.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 773.3125,
      "completions/mean_terminated_length": 675.2174072265625,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 2.25,
      "grad_norm": 0.10943753744940068,
      "kl": 0.39892578125,
      "learning_rate": 3.0284151489696264e-07,
      "loss": 0.0004,
      "num_tokens": 21215541.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 822.0,
      "completions/mean_length": 733.3125,
      "completions/mean_terminated_length": 601.1818237304688,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 2.253086419753086,
      "grad_norm": 1.2409572765841255,
      "kl": 0.3126220703125,
      "learning_rate": 3.023527694393064e-07,
      "loss": 0.0207,
      "num_tokens": 21245451.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.1613985002040863,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 679.25,
      "completions/mean_terminated_length": 643.586181640625,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 2.256172839506173,
      "grad_norm": 0.7164065146154263,
      "kl": 0.287109375,
      "learning_rate": 3.0186381462493704e-07,
      "loss": 0.0127,
      "num_tokens": 21273583.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 720.8125,
      "completions/mean_terminated_length": 664.6666870117188,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 2.259259259259259,
      "grad_norm": 0.5526665202311036,
      "kl": 0.2779541015625,
      "learning_rate": 3.0137465240916614e-07,
      "loss": 0.0142,
      "num_tokens": 21303117.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 713.28125,
      "completions/mean_terminated_length": 641.5769653320312,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 2.2623456790123457,
      "grad_norm": 1.29598252946043,
      "kl": 0.297119140625,
      "learning_rate": 3.008852847481346e-07,
      "loss": -0.0244,
      "num_tokens": 21332378.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.21016982197761536,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 697.1875,
      "completions/mean_terminated_length": 663.3793334960938,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 2.265432098765432,
      "grad_norm": 1.3865288813904986,
      "kl": 0.2939453125,
      "learning_rate": 3.003957135988049e-07,
      "loss": -0.0054,
      "num_tokens": 21361148.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.21970567107200623,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 664.65625,
      "completions/mean_terminated_length": 613.3214721679688,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.2685185185185186,
      "grad_norm": 0.4971952411562597,
      "kl": 0.343505859375,
      "learning_rate": 2.999059409189533e-07,
      "loss": 0.0221,
      "num_tokens": 21388909.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 767.78125,
      "completions/mean_terminated_length": 651.3181762695312,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 2.271604938271605,
      "grad_norm": 0.011462006251291434,
      "kl": 0.3271484375,
      "learning_rate": 2.9941596866716174e-07,
      "loss": 0.0003,
      "num_tokens": 21420290.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 738.375,
      "completions/mean_terminated_length": 643.1666870117188,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 2.2746913580246915,
      "grad_norm": 0.7485629636877662,
      "kl": 0.32470703125,
      "learning_rate": 2.989257988028105e-07,
      "loss": 0.0215,
      "num_tokens": 21450518.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 750.59375,
      "completions/mean_terminated_length": 626.3181762695312,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 2.2777777777777777,
      "grad_norm": 1.1199405288147424,
      "kl": 0.32666015625,
      "learning_rate": 2.984354332860702e-07,
      "loss": 0.0284,
      "num_tokens": 21481077.0,
      "reward": -4.190951585769653e-09,
      "reward_std": 0.20220693945884705,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 690.09375,
      "completions/mean_terminated_length": 642.3928833007812,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 2.2808641975308643,
      "grad_norm": 1.2895971713547376,
      "kl": 0.298828125,
      "learning_rate": 2.979448740778935e-07,
      "loss": -0.0068,
      "num_tokens": 21509788.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1973792016506195,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 775.625,
      "completions/mean_terminated_length": 706.0799560546875,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 2.2839506172839505,
      "grad_norm": 1.0120060608074388,
      "kl": 0.287353515625,
      "learning_rate": 2.9745412314000786e-07,
      "loss": 0.0098,
      "num_tokens": 21541356.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 816.0625,
      "completions/mean_terminated_length": 734.6956787109375,
      "completions/min_length": 554.0,
      "completions/min_terminated_length": 554.0,
      "epoch": 2.287037037037037,
      "grad_norm": 0.5461073719288805,
      "kl": 0.292724609375,
      "learning_rate": 2.9696318243490746e-07,
      "loss": -0.005,
      "num_tokens": 21574434.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 643.4375,
      "completions/mean_terminated_length": 604.0689697265625,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 2.2901234567901234,
      "grad_norm": 1.0759940179355871,
      "kl": 0.3154296875,
      "learning_rate": 2.9647205392584533e-07,
      "loss": -0.0425,
      "num_tokens": 21601020.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1582072526216507,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 692.53125,
      "completions/mean_terminated_length": 616.0385131835938,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 2.29320987654321,
      "grad_norm": 0.6491421542546149,
      "kl": 0.31689453125,
      "learning_rate": 2.959807395768255e-07,
      "loss": -0.0451,
      "num_tokens": 21629493.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 746.1875,
      "completions/mean_terminated_length": 653.5833740234375,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 2.2962962962962963,
      "grad_norm": 0.6302045720515893,
      "kl": 0.275146484375,
      "learning_rate": 2.95489241352595e-07,
      "loss": 0.0297,
      "num_tokens": 21659619.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 778.0,
      "completions/mean_terminated_length": 696.0,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 2.299382716049383,
      "grad_norm": 1.1125022853901245,
      "kl": 0.294189453125,
      "learning_rate": 2.949975612186366e-07,
      "loss": -0.0008,
      "num_tokens": 21691223.0,
      "reward": 0.0,
      "reward_std": 0.12571673095226288,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 692.71875,
      "completions/mean_terminated_length": 616.2692260742188,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.302469135802469,
      "grad_norm": 0.8809908359461416,
      "kl": 0.3489990234375,
      "learning_rate": 2.9450570114116014e-07,
      "loss": 0.035,
      "num_tokens": 21719174.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 709.46875,
      "completions/mean_terminated_length": 636.8846435546875,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 2.3055555555555554,
      "grad_norm": 0.013393503118300074,
      "kl": 0.31298828125,
      "learning_rate": 2.9401366308709513e-07,
      "loss": 0.0003,
      "num_tokens": 21748533.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 739.375,
      "completions/mean_terminated_length": 644.5,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 2.308641975308642,
      "grad_norm": 0.009935897726793609,
      "kl": 0.2918701171875,
      "learning_rate": 2.9352144902408296e-07,
      "loss": 0.0003,
      "num_tokens": 21778437.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 759.25,
      "completions/mean_terminated_length": 685.1199951171875,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 2.3117283950617282,
      "grad_norm": 1.0034108345721173,
      "kl": 0.25439453125,
      "learning_rate": 2.930290609204686e-07,
      "loss": 0.0479,
      "num_tokens": 21809597.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15807422995567322,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 750.78125,
      "completions/mean_terminated_length": 626.5909423828125,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 2.314814814814815,
      "grad_norm": 1.1636573404859127,
      "kl": 0.2874755859375,
      "learning_rate": 2.925365007452933e-07,
      "loss": 0.0123,
      "num_tokens": 21839998.0,
      "reward": -1.3969838619232178e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 686.5,
      "completions/mean_terminated_length": 664.0000610351562,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 2.317901234567901,
      "grad_norm": 1.4033954819531933,
      "kl": 0.307861328125,
      "learning_rate": 2.920437704682861e-07,
      "loss": 0.0044,
      "num_tokens": 21868306.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.17682674527168274,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 720.4375,
      "completions/mean_terminated_length": 650.3846435546875,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.3209876543209877,
      "grad_norm": 0.8705940481720116,
      "kl": 0.2899169921875,
      "learning_rate": 2.915508720598566e-07,
      "loss": -0.0247,
      "num_tokens": 21897964.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 715.3125,
      "completions/mean_terminated_length": 644.0769653320312,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 2.324074074074074,
      "grad_norm": 0.6377240184037994,
      "kl": 0.2913818359375,
      "learning_rate": 2.910578074910865e-07,
      "loss": 0.0214,
      "num_tokens": 21927366.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 733.96875,
      "completions/mean_terminated_length": 680.25927734375,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 2.3271604938271606,
      "grad_norm": 0.8587335816847671,
      "kl": 0.2869873046875,
      "learning_rate": 2.9056457873372213e-07,
      "loss": -0.0115,
      "num_tokens": 21957433.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 751.40625,
      "completions/mean_terminated_length": 675.0799560546875,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 2.330246913580247,
      "grad_norm": 0.976589983769839,
      "kl": 0.33203125,
      "learning_rate": 2.9007118776016635e-07,
      "loss": -0.0064,
      "num_tokens": 21987746.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 688.6875,
      "completions/mean_terminated_length": 640.7857666015625,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 2.3333333333333335,
      "grad_norm": 0.8140323731714041,
      "kl": 0.30712890625,
      "learning_rate": 2.895776365434706e-07,
      "loss": 0.0236,
      "num_tokens": 22015872.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 716.875,
      "completions/mean_terminated_length": 646.0,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 2.3364197530864197,
      "grad_norm": 0.06551671188304388,
      "kl": 0.378662109375,
      "learning_rate": 2.8908392705732724e-07,
      "loss": 0.0004,
      "num_tokens": 22044812.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 715.1875,
      "completions/mean_terminated_length": 671.0714721679688,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 2.3395061728395063,
      "grad_norm": 1.1810752306099652,
      "kl": 0.31884765625,
      "learning_rate": 2.885900612760616e-07,
      "loss": -0.0153,
      "num_tokens": 22073846.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.19472569227218628,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 687.25,
      "completions/mean_terminated_length": 652.413818359375,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 2.3425925925925926,
      "grad_norm": 1.07281953205766,
      "kl": 0.27734375,
      "learning_rate": 2.8809604117462397e-07,
      "loss": -0.0141,
      "num_tokens": 22101970.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20704184472560883,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 737.90625,
      "completions/mean_terminated_length": 657.7999877929688,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 2.3456790123456788,
      "grad_norm": 1.3192607792629212,
      "kl": 0.3134765625,
      "learning_rate": 2.876018687285817e-07,
      "loss": 0.0074,
      "num_tokens": 22132591.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.15570716559886932,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 706.59375,
      "completions/mean_terminated_length": 647.8148193359375,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 2.3487654320987654,
      "grad_norm": 0.9535517686962172,
      "kl": 0.30419921875,
      "learning_rate": 2.8710754591411147e-07,
      "loss": 0.0004,
      "num_tokens": 22161702.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 862.0,
      "completions/mean_length": 695.34375,
      "completions/mean_terminated_length": 661.3448486328125,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 2.351851851851852,
      "grad_norm": 0.7668050290613273,
      "kl": 0.2891845703125,
      "learning_rate": 2.8661307470799114e-07,
      "loss": -0.002,
      "num_tokens": 22190205.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 695.84375,
      "completions/mean_terminated_length": 635.0740966796875,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 2.3549382716049383,
      "grad_norm": 1.0176226128200823,
      "kl": 0.302978515625,
      "learning_rate": 2.861184570875921e-07,
      "loss": -0.041,
      "num_tokens": 22218836.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 702.65625,
      "completions/mean_terminated_length": 656.75,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 2.3580246913580245,
      "grad_norm": 0.6903004677162509,
      "kl": 0.30908203125,
      "learning_rate": 2.856236950308711e-07,
      "loss": 0.0013,
      "num_tokens": 22247517.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 762.71875,
      "completions/mean_terminated_length": 689.5599975585938,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 2.361111111111111,
      "grad_norm": 0.6495978029567093,
      "kl": 0.276123046875,
      "learning_rate": 2.851287905163628e-07,
      "loss": 0.0194,
      "num_tokens": 22278528.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 725.4375,
      "completions/mean_terminated_length": 694.5516967773438,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 2.3641975308641974,
      "grad_norm": 0.8074552020078114,
      "kl": 0.3009033203125,
      "learning_rate": 2.8463374552317123e-07,
      "loss": 0.0075,
      "num_tokens": 22308238.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 850.0,
      "completions/mean_length": 695.59375,
      "completions/mean_terminated_length": 661.6206665039062,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 2.367283950617284,
      "grad_norm": 0.543797783597461,
      "kl": 0.3115234375,
      "learning_rate": 2.8413856203096226e-07,
      "loss": 0.0127,
      "num_tokens": 22336429.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 739.15625,
      "completions/mean_terminated_length": 627.6956787109375,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 2.3703703703703702,
      "grad_norm": 0.9030282705032256,
      "kl": 0.31591796875,
      "learning_rate": 2.836432420199557e-07,
      "loss": -0.01,
      "num_tokens": 22367050.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.04620163142681122,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 770.5,
      "completions/mean_terminated_length": 712.0,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 2.373456790123457,
      "grad_norm": 6.818904619799035,
      "kl": 2.0286865234375,
      "learning_rate": 2.831477874709172e-07,
      "loss": -0.0081,
      "num_tokens": 22399074.0,
      "reward": 0.0,
      "reward_std": 0.178672194480896,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 796.5,
      "completions/mean_terminated_length": 677.3333740234375,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 2.376543209876543,
      "grad_norm": 0.623314486037102,
      "kl": 0.2818603515625,
      "learning_rate": 2.826522003651504e-07,
      "loss": 0.0174,
      "num_tokens": 22431374.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 751.625,
      "completions/mean_terminated_length": 660.8333740234375,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 2.3796296296296298,
      "grad_norm": 0.4352854882504204,
      "kl": 0.28125,
      "learning_rate": 2.8215648268448926e-07,
      "loss": 0.0304,
      "num_tokens": 22461682.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 727.1875,
      "completions/mean_terminated_length": 644.0799560546875,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 2.382716049382716,
      "grad_norm": 0.8684794622121298,
      "kl": 0.281005859375,
      "learning_rate": 2.8166063641128963e-07,
      "loss": 0.0193,
      "num_tokens": 22491364.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 733.0,
      "completions/mean_terminated_length": 702.8965454101562,
      "completions/min_length": 532.0,
      "completions/min_terminated_length": 532.0,
      "epoch": 2.3858024691358026,
      "grad_norm": 0.05524853583081608,
      "kl": 0.317138671875,
      "learning_rate": 2.8116466352842165e-07,
      "loss": 0.0003,
      "num_tokens": 22520964.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 655.71875,
      "completions/mean_terminated_length": 603.107177734375,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 2.388888888888889,
      "grad_norm": 1.369594835323384,
      "kl": 0.273681640625,
      "learning_rate": 2.80668566019262e-07,
      "loss": 0.0083,
      "num_tokens": 22548351.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.2107936292886734,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 819.0,
      "completions/mean_length": 700.1875,
      "completions/mean_terminated_length": 609.5199584960938,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 2.3919753086419755,
      "grad_norm": 0.8060769779208158,
      "kl": 0.294921875,
      "learning_rate": 2.8017234586768534e-07,
      "loss": -0.0052,
      "num_tokens": 22577225.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 691.78125,
      "completions/mean_terminated_length": 657.413818359375,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 2.3950617283950617,
      "grad_norm": 1.0375051388367698,
      "kl": 0.296875,
      "learning_rate": 2.796760050580571e-07,
      "loss": -0.0175,
      "num_tokens": 22605962.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.15559504926204681,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 745.53125,
      "completions/mean_terminated_length": 693.9629516601562,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 2.398148148148148,
      "grad_norm": 0.8340971492924649,
      "kl": 0.310546875,
      "learning_rate": 2.7917954557522503e-07,
      "loss": -0.0313,
      "num_tokens": 22636531.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 703.03125,
      "completions/mean_terminated_length": 596.0416870117188,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 2.4012345679012346,
      "grad_norm": 1.2557599420112906,
      "kl": 0.2816162109375,
      "learning_rate": 2.786829694045116e-07,
      "loss": -0.0246,
      "num_tokens": 22665720.0,
      "reward": 0.0,
      "reward_std": 0.18573541939258575,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 757.53125,
      "completions/mean_terminated_length": 653.2608642578125,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 2.4043209876543212,
      "grad_norm": 1.159626860385025,
      "kl": 0.2725830078125,
      "learning_rate": 2.7818627853170585e-07,
      "loss": -0.0034,
      "num_tokens": 22696929.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 699.375,
      "completions/mean_terminated_length": 653.0,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 2.4074074074074074,
      "grad_norm": 1.1340259348539956,
      "kl": 0.27978515625,
      "learning_rate": 2.7768947494305545e-07,
      "loss": -0.0003,
      "num_tokens": 22725821.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 712.8125,
      "completions/mean_terminated_length": 668.357177734375,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 2.4104938271604937,
      "grad_norm": 0.6779508979306605,
      "kl": 0.24578857421875,
      "learning_rate": 2.7719256062525884e-07,
      "loss": 0.006,
      "num_tokens": 22755187.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 793.0625,
      "completions/mean_terminated_length": 750.2963256835938,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 2.4135802469135803,
      "grad_norm": 0.00910277949727657,
      "kl": 0.2523193359375,
      "learning_rate": 2.766955375654573e-07,
      "loss": 0.0003,
      "num_tokens": 22787733.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 762.1875,
      "completions/mean_terminated_length": 688.8800048828125,
      "completions/min_length": 575.0,
      "completions/min_terminated_length": 575.0,
      "epoch": 2.4166666666666665,
      "grad_norm": 1.2121234705060395,
      "kl": 0.2923583984375,
      "learning_rate": 2.7619840775122695e-07,
      "loss": -0.0294,
      "num_tokens": 22818799.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1531248539686203,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 699.09375,
      "completions/mean_terminated_length": 665.4827270507812,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 2.419753086419753,
      "grad_norm": 1.3078284181942887,
      "kl": 0.28662109375,
      "learning_rate": 2.7570117317057087e-07,
      "loss": 0.003,
      "num_tokens": 22847198.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.1515917330980301,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 669.0,
      "completions/mean_terminated_length": 632.27587890625,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 2.4228395061728394,
      "grad_norm": 1.3444904972323186,
      "kl": 0.30908203125,
      "learning_rate": 2.7520383581191085e-07,
      "loss": -0.043,
      "num_tokens": 22874666.0,
      "reward": 0.0,
      "reward_std": 0.17122045159339905,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 683.8125,
      "completions/mean_terminated_length": 648.6206665039062,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 2.425925925925926,
      "grad_norm": 1.4438116017026041,
      "kl": 0.2852783203125,
      "learning_rate": 2.7470639766408003e-07,
      "loss": -0.012,
      "num_tokens": 22903256.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.21133752167224884,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 738.9375,
      "completions/mean_terminated_length": 659.1199951171875,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 2.4290123456790123,
      "grad_norm": 1.6836583122916187,
      "kl": 0.447021484375,
      "learning_rate": 2.7420886071631455e-07,
      "loss": 0.0786,
      "num_tokens": 22933430.0,
      "reward": 0.0,
      "reward_std": 0.1577567458152771,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 757.15625,
      "completions/mean_terminated_length": 635.8636474609375,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 2.432098765432099,
      "grad_norm": 1.4462823123660733,
      "kl": 0.2528076171875,
      "learning_rate": 2.7371122695824534e-07,
      "loss": -0.0423,
      "num_tokens": 22964251.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.2167874276638031,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 716.96875,
      "completions/mean_terminated_length": 673.107177734375,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 2.435185185185185,
      "grad_norm": 0.8324573179981034,
      "kl": 0.2869873046875,
      "learning_rate": 2.732134983798907e-07,
      "loss": -0.0033,
      "num_tokens": 22993358.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 652.34375,
      "completions/mean_terminated_length": 613.8965454101562,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 2.4382716049382718,
      "grad_norm": 0.8389245706202125,
      "kl": 0.287109375,
      "learning_rate": 2.727156769716482e-07,
      "loss": -0.0081,
      "num_tokens": 23020409.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 721.96875,
      "completions/mean_terminated_length": 690.72412109375,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 2.441358024691358,
      "grad_norm": 0.007722079367322754,
      "kl": 0.2696533203125,
      "learning_rate": 2.722177647242863e-07,
      "loss": 0.0003,
      "num_tokens": 23049816.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 757.84375,
      "completions/mean_terminated_length": 696.423095703125,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 2.4444444444444446,
      "grad_norm": 1.038484419275493,
      "kl": 0.271728515625,
      "learning_rate": 2.717197636289373e-07,
      "loss": -0.0391,
      "num_tokens": 23080615.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15867924690246582,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 726.84375,
      "completions/mean_terminated_length": 684.3928833007812,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 2.447530864197531,
      "grad_norm": 1.0483591323718822,
      "kl": 0.300048828125,
      "learning_rate": 2.712216756770881e-07,
      "loss": 0.0438,
      "num_tokens": 23110310.0,
      "reward": 0.0,
      "reward_std": 0.15284463763237,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 813.8125,
      "completions/mean_terminated_length": 718.2727661132812,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 2.450617283950617,
      "grad_norm": 1.4077606143842762,
      "kl": 0.2593994140625,
      "learning_rate": 2.7072350286057354e-07,
      "loss": -0.0301,
      "num_tokens": 23143416.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.1905532032251358,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 784.875,
      "completions/mean_terminated_length": 676.1818237304688,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 2.4537037037037037,
      "grad_norm": 0.009587927875737758,
      "kl": 0.2674560546875,
      "learning_rate": 2.7022524717156734e-07,
      "loss": 0.0003,
      "num_tokens": 23174632.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 734.09375,
      "completions/mean_terminated_length": 637.4583740234375,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 2.45679012345679,
      "grad_norm": 1.0297913705744441,
      "kl": 0.2774658203125,
      "learning_rate": 2.6972691060257504e-07,
      "loss": 0.0184,
      "num_tokens": 23204627.0,
      "reward": -4.6566128730773926e-09,
      "reward_std": 0.11199356615543365,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 777.90625,
      "completions/mean_terminated_length": 630.25,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 2.4598765432098766,
      "grad_norm": 1.4406329964664708,
      "kl": 0.2735595703125,
      "learning_rate": 2.6922849514642524e-07,
      "loss": -0.0354,
      "num_tokens": 23236272.0,
      "reward": -1.6763806343078613e-08,
      "reward_std": 0.16224205493927002,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 755.3125,
      "completions/mean_terminated_length": 716.9285888671875,
      "completions/min_length": 525.0,
      "completions/min_terminated_length": 525.0,
      "epoch": 2.462962962962963,
      "grad_norm": 0.9164887141421001,
      "kl": 0.2772216796875,
      "learning_rate": 2.687300027962624e-07,
      "loss": 0.0078,
      "num_tokens": 23267274.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.06432675570249557,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 718.78125,
      "completions/mean_terminated_length": 662.25927734375,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 2.4660493827160495,
      "grad_norm": 1.534588977749662,
      "kl": 0.2891845703125,
      "learning_rate": 2.682314355455381e-07,
      "loss": 0.0485,
      "num_tokens": 23296879.0,
      "reward": -1.4901161193847656e-08,
      "reward_std": 0.21280820667743683,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.43994131684303284,
      "step": 799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 786.34375,
      "completions/mean_terminated_length": 707.125,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 2.4691358024691357,
      "grad_norm": 0.9315084172418175,
      "kl": 0.26025390625,
      "learning_rate": 2.677327953880038e-07,
      "loss": -0.0142,
      "num_tokens": 23328834.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 749.84375,
      "completions/mean_terminated_length": 699.0740966796875,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 2.4722222222222223,
      "grad_norm": 0.7175811500507011,
      "kl": 0.2691650390625,
      "learning_rate": 2.6723408431770214e-07,
      "loss": -0.0131,
      "num_tokens": 23359261.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 781.03125,
      "completions/mean_terminated_length": 713.0,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 2.4753086419753085,
      "grad_norm": 0.9845812387614366,
      "kl": 0.2730712890625,
      "learning_rate": 2.6673530432895957e-07,
      "loss": 0.0095,
      "num_tokens": 23390878.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 686.65625,
      "completions/mean_terminated_length": 664.1666870117188,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 2.478395061728395,
      "grad_norm": 1.2556912955304715,
      "kl": 0.2806396484375,
      "learning_rate": 2.6623645741637815e-07,
      "loss": -0.0074,
      "num_tokens": 23418823.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 847.0,
      "completions/mean_length": 655.21875,
      "completions/mean_terminated_length": 602.5357666015625,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 2.4814814814814814,
      "grad_norm": 0.7570621661561733,
      "kl": 0.2835693359375,
      "learning_rate": 2.6573754557482746e-07,
      "loss": -0.0092,
      "num_tokens": 23445842.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 651.3125,
      "completions/mean_terminated_length": 626.4666748046875,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 2.484567901234568,
      "grad_norm": 0.8131351621463883,
      "kl": 0.303466796875,
      "learning_rate": 2.652385707994369e-07,
      "loss": -0.0304,
      "num_tokens": 23472936.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.1317192167043686,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 720.03125,
      "completions/mean_terminated_length": 699.7667236328125,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 2.4876543209876543,
      "grad_norm": 1.0608940637272115,
      "kl": 0.280029296875,
      "learning_rate": 2.6473953508558726e-07,
      "loss": 0.0071,
      "num_tokens": 23502509.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 721.75,
      "completions/mean_terminated_length": 665.7777709960938,
      "completions/min_length": 526.0,
      "completions/min_terminated_length": 526.0,
      "epoch": 2.490740740740741,
      "grad_norm": 0.7478641121751027,
      "kl": 0.2586669921875,
      "learning_rate": 2.6424044042890334e-07,
      "loss": 0.0002,
      "num_tokens": 23531869.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 711.59375,
      "completions/mean_terminated_length": 639.5,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 2.493827160493827,
      "grad_norm": 0.9939590495143553,
      "kl": 0.27783203125,
      "learning_rate": 2.6374128882524527e-07,
      "loss": -0.0195,
      "num_tokens": 23561128.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 768.71875,
      "completions/mean_terminated_length": 709.8077392578125,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 2.496913580246914,
      "grad_norm": 0.9438428030549388,
      "kl": 0.2557373046875,
      "learning_rate": 2.6324208227070136e-07,
      "loss": -0.0343,
      "num_tokens": 23592107.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 689.5,
      "completions/mean_terminated_length": 612.3077392578125,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 2.5,
      "grad_norm": 1.2342092287156805,
      "kl": 0.2899169921875,
      "learning_rate": 2.6274282276157934e-07,
      "loss": 0.0197,
      "num_tokens": 23620143.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.15541993081569672,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 759.15625,
      "completions/mean_terminated_length": 685.0,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 2.503086419753086,
      "grad_norm": 1.0885787695527718,
      "kl": 0.2723388671875,
      "learning_rate": 2.622435122943987e-07,
      "loss": -0.0143,
      "num_tokens": 23651120.0,
      "reward": 0.0,
      "reward_std": 0.17995095252990723,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 705.59375,
      "completions/mean_terminated_length": 672.6551513671875,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 2.506172839506173,
      "grad_norm": 2.2527079144224795,
      "kl": 0.273681640625,
      "learning_rate": 2.61744152865883e-07,
      "loss": 0.1885,
      "num_tokens": 23679915.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2709388732910156,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 846.0,
      "completions/mean_length": 695.78125,
      "completions/mean_terminated_length": 648.8928833007812,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 2.5092592592592595,
      "grad_norm": 0.5731926671606395,
      "kl": 0.269775390625,
      "learning_rate": 2.6124474647295137e-07,
      "loss": -0.0033,
      "num_tokens": 23708484.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.02981424145400524,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 683.625,
      "completions/mean_terminated_length": 660.933349609375,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 2.5123456790123457,
      "grad_norm": 1.1484305020897618,
      "kl": 0.278076171875,
      "learning_rate": 2.607452951127107e-07,
      "loss": -0.0489,
      "num_tokens": 23737172.0,
      "reward": 0.0,
      "reward_std": 0.1328706294298172,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 796.5,
      "completions/mean_terminated_length": 693.0909423828125,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 2.515432098765432,
      "grad_norm": 0.8305244512573621,
      "kl": 0.2659912109375,
      "learning_rate": 2.6024580078244777e-07,
      "loss": 0.0045,
      "num_tokens": 23769328.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 675.71875,
      "completions/mean_terminated_length": 625.9642944335938,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 2.5185185185185186,
      "grad_norm": 1.4278326343250904,
      "kl": 0.2794189453125,
      "learning_rate": 2.5974626547962127e-07,
      "loss": -0.0616,
      "num_tokens": 23797291.0,
      "reward": 0.0,
      "reward_std": 0.19526955485343933,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096889972687,
      "step": 816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 742.8125,
      "completions/mean_terminated_length": 649.0833740234375,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 2.521604938271605,
      "grad_norm": 0.6616642904593187,
      "kl": 0.2615966796875,
      "learning_rate": 2.5924669120185373e-07,
      "loss": 0.0081,
      "num_tokens": 23827093.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 750.75,
      "completions/mean_terminated_length": 700.1481323242188,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 2.5246913580246915,
      "grad_norm": 0.7219404271245976,
      "kl": 0.2576904296875,
      "learning_rate": 2.5874707994692333e-07,
      "loss": -0.0178,
      "num_tokens": 23857377.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 706.65625,
      "completions/mean_terminated_length": 673.8275756835938,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 2.5277777777777777,
      "grad_norm": 0.8481421154306074,
      "kl": 0.281005859375,
      "learning_rate": 2.582474337127564e-07,
      "loss": -0.0261,
      "num_tokens": 23886670.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 688.15625,
      "completions/mean_terminated_length": 653.413818359375,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 2.5308641975308643,
      "grad_norm": 1.0360589548354595,
      "kl": 0.2630615234375,
      "learning_rate": 2.5774775449741903e-07,
      "loss": -0.028,
      "num_tokens": 23915343.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15887999534606934,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 729.78125,
      "completions/mean_terminated_length": 687.7500610351562,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 2.5339506172839505,
      "grad_norm": 1.0788661192732718,
      "kl": 0.277099609375,
      "learning_rate": 2.572480442991092e-07,
      "loss": -0.0307,
      "num_tokens": 23945272.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 766.90625,
      "completions/mean_terminated_length": 681.2083740234375,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 2.537037037037037,
      "grad_norm": 0.6432639315080375,
      "kl": 0.271484375,
      "learning_rate": 2.567483051161487e-07,
      "loss": 0.0367,
      "num_tokens": 23976409.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 702.53125,
      "completions/mean_terminated_length": 643.0,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 2.5401234567901234,
      "grad_norm": 1.0878890966955206,
      "kl": 0.287109375,
      "learning_rate": 2.562485389469754e-07,
      "loss": 0.0382,
      "num_tokens": 24004966.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 679.59375,
      "completions/mean_terminated_length": 643.9655151367188,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 2.5432098765432096,
      "grad_norm": 1.2688152577808474,
      "kl": 0.301513671875,
      "learning_rate": 2.5574874779013494e-07,
      "loss": 0.0335,
      "num_tokens": 24032845.0,
      "reward": 0.0,
      "reward_std": 0.23370715975761414,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 842.0,
      "completions/mean_length": 681.78125,
      "completions/mean_terminated_length": 618.4074096679688,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 2.5462962962962963,
      "grad_norm": 1.3680426474702223,
      "kl": 0.301025390625,
      "learning_rate": 2.5524893364427307e-07,
      "loss": -0.0296,
      "num_tokens": 24060926.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.13619740307331085,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 860.0,
      "completions/mean_length": 720.09375,
      "completions/mean_terminated_length": 635.0,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 2.549382716049383,
      "grad_norm": 1.3668319384193501,
      "kl": 0.2723388671875,
      "learning_rate": 2.547490985081272e-07,
      "loss": -0.0103,
      "num_tokens": 24090577.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19500115513801575,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 667.03125,
      "completions/mean_terminated_length": 630.1034545898438,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 2.552469135802469,
      "grad_norm": 1.5669038390289485,
      "kl": 0.2872314453125,
      "learning_rate": 2.5424924438051896e-07,
      "loss": 0.0276,
      "num_tokens": 24118426.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.20564739406108856,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 839.0,
      "completions/mean_length": 643.78125,
      "completions/mean_terminated_length": 631.51611328125,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 2.5555555555555554,
      "grad_norm": 1.0590135894169592,
      "kl": 0.29736328125,
      "learning_rate": 2.5374937326034575e-07,
      "loss": -0.0271,
      "num_tokens": 24145019.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 809.375,
      "completions/mean_terminated_length": 725.3912963867188,
      "completions/min_length": 560.0,
      "completions/min_terminated_length": 560.0,
      "epoch": 2.558641975308642,
      "grad_norm": 1.1239317310925958,
      "kl": 0.307373046875,
      "learning_rate": 2.5324948714657287e-07,
      "loss": -0.0269,
      "num_tokens": 24177311.0,
      "reward": 0.0,
      "reward_std": 0.19456422328948975,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 764.40625,
      "completions/mean_terminated_length": 727.3214721679688,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 2.5617283950617287,
      "grad_norm": 1.0366865539180945,
      "kl": 0.288818359375,
      "learning_rate": 2.527495880382259e-07,
      "loss": -0.0707,
      "num_tokens": 24208148.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 724.71875,
      "completions/mean_terminated_length": 640.9199829101562,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 2.564814814814815,
      "grad_norm": 1.2986288395674328,
      "kl": 0.289306640625,
      "learning_rate": 2.522496779343819e-07,
      "loss": -0.0275,
      "num_tokens": 24237679.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 843.0,
      "completions/mean_length": 652.28125,
      "completions/mean_terminated_length": 627.5000610351562,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 2.567901234567901,
      "grad_norm": 0.8025661060787778,
      "kl": 0.3109130859375,
      "learning_rate": 2.5174975883416237e-07,
      "loss": 0.0056,
      "num_tokens": 24264684.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 729.59375,
      "completions/mean_terminated_length": 675.0740966796875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 2.5709876543209877,
      "grad_norm": 0.08401355068948127,
      "kl": 0.310546875,
      "learning_rate": 2.512498327367245e-07,
      "loss": 0.0003,
      "num_tokens": 24294691.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 734.125,
      "completions/mean_terminated_length": 704.137939453125,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 2.574074074074074,
      "grad_norm": 1.0560475472697004,
      "kl": 0.286865234375,
      "learning_rate": 2.5074990164125355e-07,
      "loss": -0.0448,
      "num_tokens": 24324623.0,
      "reward": 0.0,
      "reward_std": 0.12547743320465088,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 756.5625,
      "completions/mean_terminated_length": 707.0370483398438,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 2.5771604938271606,
      "grad_norm": 0.5883132573063482,
      "kl": 0.276611328125,
      "learning_rate": 2.502499675469547e-07,
      "loss": 0.0204,
      "num_tokens": 24355029.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 745.28125,
      "completions/mean_terminated_length": 705.4642944335938,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 2.580246913580247,
      "grad_norm": 0.7565401852315675,
      "kl": 0.262451171875,
      "learning_rate": 2.497500324530453e-07,
      "loss": -0.0087,
      "num_tokens": 24385450.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 733.59375,
      "completions/mean_terminated_length": 692.107177734375,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 2.5833333333333335,
      "grad_norm": 0.8986246072885274,
      "kl": 0.2607421875,
      "learning_rate": 2.4925009835874643e-07,
      "loss": 0.0001,
      "num_tokens": 24415165.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 687.3125,
      "completions/mean_terminated_length": 664.86669921875,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 2.5864197530864197,
      "grad_norm": 0.7745117491811915,
      "kl": 0.287353515625,
      "learning_rate": 2.4875016726327555e-07,
      "loss": 0.0201,
      "num_tokens": 24443231.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 737.46875,
      "completions/mean_terminated_length": 684.4074096679688,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 2.5895061728395063,
      "grad_norm": 1.27069380642628,
      "kl": 0.25390625,
      "learning_rate": 2.482502411658376e-07,
      "loss": 0.0084,
      "num_tokens": 24473338.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.18805178999900818,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 5.587935447692871e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 743.1875,
      "completions/mean_terminated_length": 664.5599975585938,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 2.5925925925925926,
      "grad_norm": 1.0143959017732704,
      "kl": 0.28515625,
      "learning_rate": 2.477503220656181e-07,
      "loss": 0.0347,
      "num_tokens": 24503412.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.16838626563549042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 722.15625,
      "completions/mean_terminated_length": 666.25927734375,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 2.5956790123456788,
      "grad_norm": 0.4954769982377692,
      "kl": 0.27880859375,
      "learning_rate": 2.472504119617742e-07,
      "loss": 0.0227,
      "num_tokens": 24532537.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 782.9375,
      "completions/mean_terminated_length": 748.5000610351562,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 2.5987654320987654,
      "grad_norm": 1.1298471444079328,
      "kl": 0.260009765625,
      "learning_rate": 2.4675051285342716e-07,
      "loss": -0.0419,
      "num_tokens": 24564011.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.07559289038181305,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 745.0,
      "completions/mean_terminated_length": 693.3333129882812,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 2.601851851851852,
      "grad_norm": 0.6878738170747613,
      "kl": 0.3057861328125,
      "learning_rate": 2.462506267396543e-07,
      "loss": -0.0049,
      "num_tokens": 24594623.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 783.25,
      "completions/mean_terminated_length": 703.0,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 2.6049382716049383,
      "grad_norm": 0.5058747461535761,
      "kl": 0.29638671875,
      "learning_rate": 2.45750755619481e-07,
      "loss": 0.0012,
      "num_tokens": 24626243.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 819.0,
      "completions/mean_length": 668.65625,
      "completions/mean_terminated_length": 617.8928833007812,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 2.6080246913580245,
      "grad_norm": 1.3353346300638027,
      "kl": 0.2615966796875,
      "learning_rate": 2.452509014918728e-07,
      "loss": 0.0659,
      "num_tokens": 24653920.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1300242692232132,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 828.0,
      "completions/mean_length": 720.34375,
      "completions/mean_terminated_length": 635.3200073242188,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 2.611111111111111,
      "grad_norm": 1.1611740377427653,
      "kl": 0.257080078125,
      "learning_rate": 2.4475106635572696e-07,
      "loss": 0.0056,
      "num_tokens": 24683467.0,
      "reward": 0.0,
      "reward_std": 0.16916194558143616,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 718.96875,
      "completions/mean_terminated_length": 675.3928833007812,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 2.6141975308641974,
      "grad_norm": 1.3708056657621666,
      "kl": 0.293212890625,
      "learning_rate": 2.4425125220986503e-07,
      "loss": -0.0198,
      "num_tokens": 24713138.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13394224643707275,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 687.9375,
      "completions/mean_terminated_length": 639.9285888671875,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 2.617283950617284,
      "grad_norm": 0.953915107942522,
      "kl": 0.2908935546875,
      "learning_rate": 2.437514610530246e-07,
      "loss": -0.0103,
      "num_tokens": 24741080.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 729.40625,
      "completions/mean_terminated_length": 709.7667236328125,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 2.6203703703703702,
      "grad_norm": 0.6262904933841149,
      "kl": 0.264892578125,
      "learning_rate": 2.4325169488385137e-07,
      "loss": -0.0066,
      "num_tokens": 24770901.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 719.34375,
      "completions/mean_terminated_length": 649.0385131835938,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 2.623456790123457,
      "grad_norm": 1.1377923496997484,
      "kl": 0.2659912109375,
      "learning_rate": 2.4275195570089083e-07,
      "loss": -0.0015,
      "num_tokens": 24800116.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 822.9375,
      "completions/mean_terminated_length": 702.2999877929688,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 2.626543209876543,
      "grad_norm": 1.588263612638342,
      "kl": 0.275634765625,
      "learning_rate": 2.42252245502581e-07,
      "loss": -0.0301,
      "num_tokens": 24833350.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.2094171941280365,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 808.625,
      "completions/mean_terminated_length": 736.8333740234375,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 2.6296296296296298,
      "grad_norm": 1.2086816185668496,
      "kl": 0.263671875,
      "learning_rate": 2.417525662872436e-07,
      "loss": -0.0232,
      "num_tokens": 24866138.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 725.90625,
      "completions/mean_terminated_length": 683.3214721679688,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 2.632716049382716,
      "grad_norm": 1.4309719486748265,
      "kl": 0.2666015625,
      "learning_rate": 2.412529200530767e-07,
      "loss": -0.0447,
      "num_tokens": 24895843.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20958730578422546,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 751.28125,
      "completions/mean_terminated_length": 712.3214721679688,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 2.6358024691358026,
      "grad_norm": 0.8428307152060934,
      "kl": 0.2628173828125,
      "learning_rate": 2.407533087981463e-07,
      "loss": -0.0232,
      "num_tokens": 24926684.0,
      "reward": 0.0,
      "reward_std": 0.12123563885688782,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 653.0625,
      "completions/mean_terminated_length": 641.0967407226562,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 2.638888888888889,
      "grad_norm": 1.4396347735888524,
      "kl": 0.2926025390625,
      "learning_rate": 2.4025373452037865e-07,
      "loss": -0.0344,
      "num_tokens": 24953906.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.20773616433143616,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 746.0625,
      "completions/mean_terminated_length": 681.923095703125,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 2.6419753086419755,
      "grad_norm": 1.3373908056729427,
      "kl": 0.30126953125,
      "learning_rate": 2.3975419921755215e-07,
      "loss": 0.0316,
      "num_tokens": 24983972.0,
      "reward": 0.0,
      "reward_std": 0.15817604959011078,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 726.75,
      "completions/mean_terminated_length": 658.1538696289062,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 2.6450617283950617,
      "grad_norm": 1.386289547194358,
      "kl": 0.264892578125,
      "learning_rate": 2.3925470488728935e-07,
      "loss": -0.0423,
      "num_tokens": 25013972.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.14240965247154236,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 661.8125,
      "completions/mean_terminated_length": 650.1290283203125,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 2.648148148148148,
      "grad_norm": 0.9448071870602488,
      "kl": 0.2669677734375,
      "learning_rate": 2.3875525352704866e-07,
      "loss": -0.0172,
      "num_tokens": 25041362.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 676.0,
      "completions/mean_terminated_length": 652.800048828125,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 2.6512345679012346,
      "grad_norm": 1.0594595146085641,
      "kl": 0.2581787109375,
      "learning_rate": 2.38255847134117e-07,
      "loss": 0.0087,
      "num_tokens": 25069366.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 793.6875,
      "completions/mean_terminated_length": 673.047607421875,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 2.6543209876543212,
      "grad_norm": 1.750820591589366,
      "kl": 0.2794189453125,
      "learning_rate": 2.3775648770560126e-07,
      "loss": -0.0002,
      "num_tokens": 25101712.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2277967929840088,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 778.875,
      "completions/mean_terminated_length": 710.239990234375,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 2.6574074074074074,
      "grad_norm": 1.1716960640053096,
      "kl": 0.253662109375,
      "learning_rate": 2.3725717723842066e-07,
      "loss": -0.0033,
      "num_tokens": 25133112.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 773.78125,
      "completions/mean_terminated_length": 690.375,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 2.6604938271604937,
      "grad_norm": 0.5074171873813702,
      "kl": 0.305419921875,
      "learning_rate": 2.3675791772929862e-07,
      "loss": 0.0246,
      "num_tokens": 25164653.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 698.65625,
      "completions/mean_terminated_length": 652.1785888671875,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 2.6635802469135803,
      "grad_norm": 0.9363851628606226,
      "kl": 0.2802734375,
      "learning_rate": 2.3625871117475466e-07,
      "loss": 0.0002,
      "num_tokens": 25193398.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.12879827618598938,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 710.21875,
      "completions/mean_terminated_length": 652.1111450195312,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 2.6666666666666665,
      "grad_norm": 0.8397991235587109,
      "kl": 0.268310546875,
      "learning_rate": 2.357595595710967e-07,
      "loss": -0.0081,
      "num_tokens": 25222269.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 718.28125,
      "completions/mean_terminated_length": 661.6666870117188,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 2.669753086419753,
      "grad_norm": 2.147673799202004,
      "kl": 0.27978515625,
      "learning_rate": 2.3526046491441277e-07,
      "loss": -0.0447,
      "num_tokens": 25251902.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.17102104425430298,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 5.122274160385132e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 762.28125,
      "completions/mean_terminated_length": 675.0416870117188,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 2.6728395061728394,
      "grad_norm": 1.3506779003869993,
      "kl": 0.320068359375,
      "learning_rate": 2.3476142920056315e-07,
      "loss": -0.0104,
      "num_tokens": 25282779.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 760.34375,
      "completions/mean_terminated_length": 722.6785888671875,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 2.675925925925926,
      "grad_norm": 0.9226034555074042,
      "kl": 0.28125,
      "learning_rate": 2.3426245442517254e-07,
      "loss": 0.0013,
      "num_tokens": 25313570.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 613.21875,
      "completions/mean_terminated_length": 599.9677124023438,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 2.6790123456790123,
      "grad_norm": 1.2276290914825594,
      "kl": 0.29150390625,
      "learning_rate": 2.3376354258362185e-07,
      "loss": -0.0218,
      "num_tokens": 25339257.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19173535704612732,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 792.75,
      "completions/mean_terminated_length": 715.6666870117188,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 2.682098765432099,
      "grad_norm": 0.5015083810127813,
      "kl": 0.2744140625,
      "learning_rate": 2.3326469567104044e-07,
      "loss": 0.0187,
      "num_tokens": 25371121.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 715.1875,
      "completions/mean_terminated_length": 643.923095703125,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 2.685185185185185,
      "grad_norm": 0.011519412352654029,
      "kl": 0.294677734375,
      "learning_rate": 2.3276591568229787e-07,
      "loss": 0.0003,
      "num_tokens": 25400171.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 775.65625,
      "completions/mean_terminated_length": 706.1199951171875,
      "completions/min_length": 550.0,
      "completions/min_terminated_length": 550.0,
      "epoch": 2.6882716049382713,
      "grad_norm": 1.0218626235133945,
      "kl": 0.2607421875,
      "learning_rate": 2.3226720461199626e-07,
      "loss": -0.044,
      "num_tokens": 25432148.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 849.0,
      "completions/mean_terminated_length": 694.5882568359375,
      "completions/min_length": 573.0,
      "completions/min_terminated_length": 573.0,
      "epoch": 2.691358024691358,
      "grad_norm": 0.5338448545770911,
      "kl": 0.2791748046875,
      "learning_rate": 2.3176856445446187e-07,
      "loss": 0.0251,
      "num_tokens": 25466384.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 726.125,
      "completions/mean_terminated_length": 695.3103637695312,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 2.6944444444444446,
      "grad_norm": 1.1476966935091446,
      "kl": 0.258056640625,
      "learning_rate": 2.3126999720373757e-07,
      "loss": -0.0227,
      "num_tokens": 25496188.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.17643702030181885,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 821.59375,
      "completions/mean_terminated_length": 742.3912963867188,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.697530864197531,
      "grad_norm": 0.9983732012570077,
      "kl": 0.2777099609375,
      "learning_rate": 2.3077150485357477e-07,
      "loss": 0.001,
      "num_tokens": 25528835.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 790.125,
      "completions/mean_terminated_length": 724.6399536132812,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 2.700617283950617,
      "grad_norm": 0.6854490421533415,
      "kl": 0.2733154296875,
      "learning_rate": 2.3027308939742502e-07,
      "loss": -0.0212,
      "num_tokens": 25561083.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 741.53125,
      "completions/mean_terminated_length": 662.4400024414062,
      "completions/min_length": 266.0,
      "completions/min_terminated_length": 266.0,
      "epoch": 2.7037037037037037,
      "grad_norm": 1.2926251434162352,
      "kl": 0.2996826171875,
      "learning_rate": 2.2977475282843266e-07,
      "loss": 0.0043,
      "num_tokens": 25591296.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15677526593208313,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 688.09375,
      "completions/mean_terminated_length": 677.258056640625,
      "completions/min_length": 380.0,
      "completions/min_terminated_length": 380.0,
      "epoch": 2.7067901234567904,
      "grad_norm": 0.609544326786698,
      "kl": 0.273681640625,
      "learning_rate": 2.292764971394265e-07,
      "loss": 0.0108,
      "num_tokens": 25619547.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 711.0625,
      "completions/mean_terminated_length": 638.84619140625,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 2.7098765432098766,
      "grad_norm": 0.8579878365838796,
      "kl": 0.284912109375,
      "learning_rate": 2.2877832432291188e-07,
      "loss": -0.001,
      "num_tokens": 25648465.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 731.71875,
      "completions/mean_terminated_length": 701.4827270507812,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 2.712962962962963,
      "grad_norm": 0.8799359219560791,
      "kl": 0.2803955078125,
      "learning_rate": 2.2828023637106273e-07,
      "loss": -0.0087,
      "num_tokens": 25678656.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 721.8125,
      "completions/mean_terminated_length": 678.6428833007812,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 2.7160493827160495,
      "grad_norm": 1.0607403211756514,
      "kl": 0.287109375,
      "learning_rate": 2.2778223527571362e-07,
      "loss": -0.0256,
      "num_tokens": 25708638.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 793.875,
      "completions/mean_terminated_length": 729.4400024414062,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 2.7191358024691357,
      "grad_norm": 1.1726143724672464,
      "kl": 0.274169921875,
      "learning_rate": 2.2728432302835183e-07,
      "loss": -0.0561,
      "num_tokens": 25740870.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.21564045548439026,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 773.15625,
      "completions/mean_terminated_length": 702.9199829101562,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 2.7222222222222223,
      "grad_norm": 0.8321198331902351,
      "kl": 0.273681640625,
      "learning_rate": 2.2678650162010937e-07,
      "loss": 0.022,
      "num_tokens": 25771943.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 751.78125,
      "completions/mean_terminated_length": 661.0416870117188,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 2.7253086419753085,
      "grad_norm": 1.250253132035384,
      "kl": 0.2916259765625,
      "learning_rate": 2.2628877304175472e-07,
      "loss": -0.0401,
      "num_tokens": 25802876.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 696.4375,
      "completions/mean_terminated_length": 662.5516967773438,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 2.728395061728395,
      "grad_norm": 1.1679716124487187,
      "kl": 0.2510986328125,
      "learning_rate": 2.2579113928368548e-07,
      "loss": -0.025,
      "num_tokens": 25831410.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 760.3125,
      "completions/mean_terminated_length": 711.4815063476562,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 2.7314814814814814,
      "grad_norm": 3.346991532297941,
      "kl": 1.3779296875,
      "learning_rate": 2.2529360233591997e-07,
      "loss": 0.0036,
      "num_tokens": 25862852.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 712.3125,
      "completions/mean_terminated_length": 640.3846435546875,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 2.734567901234568,
      "grad_norm": 1.5899803042298926,
      "kl": 0.533935546875,
      "learning_rate": 2.2479616418808915e-07,
      "loss": -0.0398,
      "num_tokens": 25891950.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18978948891162872,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 763.96875,
      "completions/mean_terminated_length": 691.1599731445312,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 2.7376543209876543,
      "grad_norm": 1.063542597921851,
      "kl": 0.297607421875,
      "learning_rate": 2.242988268294292e-07,
      "loss": 0.0266,
      "num_tokens": 25923333.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 710.4375,
      "completions/mean_terminated_length": 665.6428833007812,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 2.7407407407407405,
      "grad_norm": 0.5474607759704945,
      "kl": 0.2763671875,
      "learning_rate": 2.23801592248773e-07,
      "loss": 0.0149,
      "num_tokens": 25951927.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 787.90625,
      "completions/mean_terminated_length": 664.2380981445312,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 2.743827160493827,
      "grad_norm": 0.9670648589224451,
      "kl": 0.31005859375,
      "learning_rate": 2.2330446243454265e-07,
      "loss": -0.0183,
      "num_tokens": 25983920.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1265953779220581,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 664.59375,
      "completions/mean_terminated_length": 613.25,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 2.746913580246914,
      "grad_norm": 1.1402925599138931,
      "kl": 0.2781982421875,
      "learning_rate": 2.228074393747412e-07,
      "loss": -0.0077,
      "num_tokens": 26011571.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15863974392414093,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 767.375,
      "completions/mean_terminated_length": 681.8333740234375,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 2.75,
      "grad_norm": 1.3522779112465206,
      "kl": 0.273681640625,
      "learning_rate": 2.2231052505694458e-07,
      "loss": -0.0317,
      "num_tokens": 26042987.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 766.8125,
      "completions/mean_terminated_length": 694.7999877929688,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 2.753086419753086,
      "grad_norm": 0.6430250085969144,
      "kl": 0.2457275390625,
      "learning_rate": 2.2181372146829418e-07,
      "loss": 0.0003,
      "num_tokens": 26074093.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 805.65625,
      "completions/mean_terminated_length": 732.875,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 2.756172839506173,
      "grad_norm": 1.0707958292063462,
      "kl": 0.2540283203125,
      "learning_rate": 2.213170305954884e-07,
      "loss": -0.0176,
      "num_tokens": 26106926.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 727.0625,
      "completions/mean_terminated_length": 672.0740966796875,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 2.7592592592592595,
      "grad_norm": 1.0614821263774508,
      "kl": 0.307373046875,
      "learning_rate": 2.2082045442477497e-07,
      "loss": 0.0029,
      "num_tokens": 26137132.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1258947253227234,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 777.53125,
      "completions/mean_terminated_length": 681.0869750976562,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 2.7623456790123457,
      "grad_norm": 1.570547442403824,
      "kl": 0.2452392578125,
      "learning_rate": 2.2032399494194292e-07,
      "loss": 0.0891,
      "num_tokens": 26168925.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1507483720779419,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 734.78125,
      "completions/mean_terminated_length": 681.2222290039062,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 2.765432098765432,
      "grad_norm": 1.0004121014341636,
      "kl": 0.275634765625,
      "learning_rate": 2.1982765413231466e-07,
      "loss": -0.0032,
      "num_tokens": 26199166.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 771.0,
      "completions/mean_terminated_length": 656.0,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 2.7685185185185186,
      "grad_norm": 1.0749553660395452,
      "kl": 0.29248046875,
      "learning_rate": 2.1933143398073805e-07,
      "loss": 0.0248,
      "num_tokens": 26230666.0,
      "reward": 0.05624999850988388,
      "reward_std": 0.06495190411806107,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0625,
      "rewards/logprob_reward/std": 0.24593468010425568,
      "step": 897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 747.75,
      "completions/mean_terminated_length": 684.0,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 2.771604938271605,
      "grad_norm": 0.8422264113509557,
      "kl": 0.2744140625,
      "learning_rate": 2.1883533647157828e-07,
      "loss": -0.0057,
      "num_tokens": 26261542.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 745.9375,
      "completions/mean_terminated_length": 717.1724243164062,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 2.7746913580246915,
      "grad_norm": 1.7566995325771781,
      "kl": 0.259521484375,
      "learning_rate": 2.1833936358871045e-07,
      "loss": 0.0919,
      "num_tokens": 26292444.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.22395765781402588,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.43994131684303284,
      "step": 899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 734.46875,
      "completions/mean_terminated_length": 704.5172119140625,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 2.7777777777777777,
      "grad_norm": 0.01226161702759194,
      "kl": 0.28515625,
      "learning_rate": 2.1784351731551077e-07,
      "loss": 0.0003,
      "num_tokens": 26322183.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 796.46875,
      "completions/mean_terminated_length": 707.434814453125,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 2.7808641975308643,
      "grad_norm": 1.2244764487572017,
      "kl": 0.26123046875,
      "learning_rate": 2.1734779963484959e-07,
      "loss": 0.0212,
      "num_tokens": 26354178.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.19498921930789948,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 813.0625,
      "completions/mean_terminated_length": 774.0,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 2.7839506172839505,
      "grad_norm": 1.287437015814438,
      "kl": 0.2586669921875,
      "learning_rate": 2.1685221252908282e-07,
      "loss": 0.0242,
      "num_tokens": 26387348.0,
      "reward": 0.0,
      "reward_std": 0.21456822752952576,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 749.96875,
      "completions/mean_terminated_length": 686.7307739257812,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 2.787037037037037,
      "grad_norm": 1.41860485491689,
      "kl": 0.3116455078125,
      "learning_rate": 2.163567579800443e-07,
      "loss": 0.0032,
      "num_tokens": 26418071.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 775.59375,
      "completions/mean_terminated_length": 678.3912963867188,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 2.7901234567901234,
      "grad_norm": 1.054656133279091,
      "kl": 0.2364501953125,
      "learning_rate": 2.1586143796903775e-07,
      "loss": -0.0097,
      "num_tokens": 26449738.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.17700429260730743,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 717.15625,
      "completions/mean_terminated_length": 673.3214721679688,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 2.7932098765432096,
      "grad_norm": 1.1037996347432708,
      "kl": 0.265625,
      "learning_rate": 2.1536625447682877e-07,
      "loss": 0.012,
      "num_tokens": 26478983.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 767.375,
      "completions/mean_terminated_length": 650.727294921875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 2.7962962962962963,
      "grad_norm": 1.6015105918910906,
      "kl": 0.262451171875,
      "learning_rate": 2.1487120948363713e-07,
      "loss": -0.0399,
      "num_tokens": 26509807.0,
      "reward": 0.0,
      "reward_std": 0.2630038857460022,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 5.587935447692871e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 777.8125,
      "completions/mean_terminated_length": 665.9091186523438,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 2.799382716049383,
      "grad_norm": 0.7547204069363477,
      "kl": 0.2603759765625,
      "learning_rate": 2.1437630496912889e-07,
      "loss": 0.0161,
      "num_tokens": 26541777.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 842.5625,
      "completions/mean_terminated_length": 733.7000122070312,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 2.802469135802469,
      "grad_norm": 0.9650770767730723,
      "kl": 0.2747802734375,
      "learning_rate": 2.1388154291240794e-07,
      "loss": 0.021,
      "num_tokens": 26575815.0,
      "reward": 0.0,
      "reward_std": 0.15334317088127136,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 679.5,
      "completions/mean_terminated_length": 630.2857666015625,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 2.8055555555555554,
      "grad_norm": 0.8565306768480461,
      "kl": 0.2716064453125,
      "learning_rate": 2.133869252920089e-07,
      "loss": -0.0256,
      "num_tokens": 26603619.0,
      "reward": 0.0,
      "reward_std": 0.126713365316391,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 719.1875,
      "completions/mean_terminated_length": 648.84619140625,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 2.808641975308642,
      "grad_norm": 1.2245127423131754,
      "kl": 0.2950439453125,
      "learning_rate": 2.128924540858885e-07,
      "loss": -0.0015,
      "num_tokens": 26632957.0,
      "reward": 0.0,
      "reward_std": 0.14260268211364746,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 734.5625,
      "completions/mean_terminated_length": 693.2142944335938,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 2.8117283950617287,
      "grad_norm": 0.8344859032295361,
      "kl": 0.2655029296875,
      "learning_rate": 2.1239813127141828e-07,
      "loss": 0.0041,
      "num_tokens": 26662559.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 739.8125,
      "completions/mean_terminated_length": 674.2307739257812,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 2.814814814814815,
      "grad_norm": 0.7417314091719092,
      "kl": 0.2696533203125,
      "learning_rate": 2.1190395882537598e-07,
      "loss": 0.0198,
      "num_tokens": 26692685.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 727.78125,
      "completions/mean_terminated_length": 659.423095703125,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 2.817901234567901,
      "grad_norm": 1.030812093252084,
      "kl": 0.2685546875,
      "learning_rate": 2.1140993872393833e-07,
      "loss": -0.0115,
      "num_tokens": 26722202.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 666.65625,
      "completions/mean_terminated_length": 615.607177734375,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 2.8209876543209877,
      "grad_norm": 0.009137411678997759,
      "kl": 0.306884765625,
      "learning_rate": 2.1091607294267269e-07,
      "loss": 0.0003,
      "num_tokens": 26750283.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 731.6875,
      "completions/mean_terminated_length": 664.2307739257812,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 2.824074074074074,
      "grad_norm": 1.4408771794989097,
      "kl": 0.25830078125,
      "learning_rate": 2.1042236345652947e-07,
      "loss": -0.0312,
      "num_tokens": 26780637.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.24957561492919922,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 796.34375,
      "completions/mean_terminated_length": 732.5999755859375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 2.8271604938271606,
      "grad_norm": 0.8706589470234878,
      "kl": 0.2572021484375,
      "learning_rate": 2.0992881223983368e-07,
      "loss": -0.0138,
      "num_tokens": 26812788.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 715.03125,
      "completions/mean_terminated_length": 643.7307739257812,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 2.830246913580247,
      "grad_norm": 0.9545348220695438,
      "kl": 0.294189453125,
      "learning_rate": 2.0943542126627784e-07,
      "loss": -0.0341,
      "num_tokens": 26842329.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 750.5625,
      "completions/mean_terminated_length": 699.9259033203125,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 2.8333333333333335,
      "grad_norm": 1.076229224187121,
      "kl": 0.258056640625,
      "learning_rate": 2.0894219250891352e-07,
      "loss": 0.0413,
      "num_tokens": 26873135.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1746388077735901,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 852.0,
      "completions/mean_length": 777.25,
      "completions/mean_terminated_length": 680.6956787109375,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 2.8364197530864197,
      "grad_norm": 0.810129740101571,
      "kl": 0.300537109375,
      "learning_rate": 2.0844912794014341e-07,
      "loss": 0.0126,
      "num_tokens": 26904691.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 718.5625,
      "completions/mean_terminated_length": 648.0769653320312,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 2.8395061728395063,
      "grad_norm": 1.028209853074001,
      "kl": 0.2764892578125,
      "learning_rate": 2.079562295317139e-07,
      "loss": 0.0078,
      "num_tokens": 26934061.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 717.25,
      "completions/mean_terminated_length": 660.4444580078125,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 2.8425925925925926,
      "grad_norm": 0.6148676306692956,
      "kl": 0.2845458984375,
      "learning_rate": 2.0746349925470672e-07,
      "loss": 0.0091,
      "num_tokens": 26963113.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 723.75,
      "completions/mean_terminated_length": 654.4615478515625,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 2.8456790123456788,
      "grad_norm": 1.5714184184051678,
      "kl": 0.2623291015625,
      "learning_rate": 2.0697093907953134e-07,
      "loss": 0.0209,
      "num_tokens": 26992537.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.22554031014442444,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 766.34375,
      "completions/mean_terminated_length": 694.2000122070312,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 2.8487654320987654,
      "grad_norm": 0.9648933776455749,
      "kl": 0.2469482421875,
      "learning_rate": 2.0647855097591704e-07,
      "loss": -0.0177,
      "num_tokens": 27023660.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15884052217006683,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 745.5,
      "completions/mean_terminated_length": 693.9259033203125,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 2.851851851851852,
      "grad_norm": 0.6825799829565506,
      "kl": 0.26904296875,
      "learning_rate": 2.0598633691290485e-07,
      "loss": 0.03,
      "num_tokens": 27054124.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 746.71875,
      "completions/mean_terminated_length": 654.2916870117188,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 2.8549382716049383,
      "grad_norm": 0.7756896185517373,
      "kl": 0.28369140625,
      "learning_rate": 2.054942988588399e-07,
      "loss": 0.005,
      "num_tokens": 27085243.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 689.09375,
      "completions/mean_terminated_length": 654.4483032226562,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.8580246913580245,
      "grad_norm": 1.2003046392689023,
      "kl": 0.2535400390625,
      "learning_rate": 2.050024387813634e-07,
      "loss": 0.0221,
      "num_tokens": 27113410.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15639330446720123,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 665.53125,
      "completions/mean_terminated_length": 641.6333618164062,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 2.861111111111111,
      "grad_norm": 1.390477833783607,
      "kl": 0.26611328125,
      "learning_rate": 2.0451075864740496e-07,
      "loss": -0.0009,
      "num_tokens": 27140931.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.17154675722122192,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 672.15625,
      "completions/mean_terminated_length": 607.0,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 2.8641975308641974,
      "grad_norm": 0.9431352061890065,
      "kl": 0.3013916015625,
      "learning_rate": 2.0401926042317455e-07,
      "loss": 0.0218,
      "num_tokens": 27168744.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 928
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 750.0,
      "completions/mean_terminated_length": 686.7692260742188,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 2.867283950617284,
      "grad_norm": 1.28669571470756,
      "kl": NaN,
      "learning_rate": 2.0352794607415465e-07,
      "loss": 0.0356,
      "num_tokens": 27199136.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.18770192563533783,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 708.53125,
      "completions/mean_terminated_length": 650.1111450195312,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 2.8703703703703702,
      "grad_norm": 1.0583589591588964,
      "kl": 0.270263671875,
      "learning_rate": 2.0303681756509254e-07,
      "loss": -0.0197,
      "num_tokens": 27228653.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 857.0,
      "completions/mean_length": 742.15625,
      "completions/mean_terminated_length": 677.1154174804688,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 2.873456790123457,
      "grad_norm": 0.8061770656390963,
      "kl": 0.2479248046875,
      "learning_rate": 2.0254587685999215e-07,
      "loss": -0.021,
      "num_tokens": 27258490.0,
      "reward": 0.0,
      "reward_std": 0.14086535573005676,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 730.5625,
      "completions/mean_terminated_length": 688.6428833007812,
      "completions/min_length": 498.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 2.876543209876543,
      "grad_norm": 0.6225019361821078,
      "kl": 0.298828125,
      "learning_rate": 2.020551259221066e-07,
      "loss": 0.0032,
      "num_tokens": 27288048.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 720.84375,
      "completions/mean_terminated_length": 664.7037353515625,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 2.8796296296296298,
      "grad_norm": 0.5814236458811808,
      "kl": 0.26953125,
      "learning_rate": 2.0156456671392988e-07,
      "loss": 0.012,
      "num_tokens": 27317987.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 768.96875,
      "completions/mean_terminated_length": 721.74072265625,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 2.882716049382716,
      "grad_norm": 0.8938298014596324,
      "kl": 0.27099609375,
      "learning_rate": 2.010742011971895e-07,
      "loss": 0.0008,
      "num_tokens": 27349590.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 699.90625,
      "completions/mean_terminated_length": 653.607177734375,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 2.8858024691358026,
      "grad_norm": 0.9615668383520513,
      "kl": 0.2698974609375,
      "learning_rate": 2.005840313328383e-07,
      "loss": -0.0073,
      "num_tokens": 27378359.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 770.1875,
      "completions/mean_terminated_length": 699.1199951171875,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 2.888888888888889,
      "grad_norm": 1.093017923551296,
      "kl": 0.2379150390625,
      "learning_rate": 2.0009405908104673e-07,
      "loss": 0.0117,
      "num_tokens": 27410009.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.14578182995319366,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 731.9375,
      "completions/mean_terminated_length": 664.5385131835938,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 2.8919753086419755,
      "grad_norm": 1.5894945367007276,
      "kl": 0.28271484375,
      "learning_rate": 1.996042864011951e-07,
      "loss": -0.034,
      "num_tokens": 27439831.0,
      "reward": 0.0,
      "reward_std": 0.25309431552886963,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 709.03125,
      "completions/mean_terminated_length": 664.0357666015625,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 2.8950617283950617,
      "grad_norm": 1.1271701128434652,
      "kl": 0.2523193359375,
      "learning_rate": 1.9911471525186534e-07,
      "loss": 0.0089,
      "num_tokens": 27469216.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 749.6875,
      "completions/mean_terminated_length": 710.5000610351562,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 2.898148148148148,
      "grad_norm": 1.4096152948567902,
      "kl": 0.2711181640625,
      "learning_rate": 1.9862534759083379e-07,
      "loss": 0.0582,
      "num_tokens": 27500262.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.2868836522102356,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 637.5,
      "completions/mean_terminated_length": 611.7333374023438,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 2.9012345679012346,
      "grad_norm": 1.71495424482088,
      "kl": 0.28076171875,
      "learning_rate": 1.9813618537506302e-07,
      "loss": 0.0773,
      "num_tokens": 27526690.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.2652709484100342,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 880.0,
      "completions/mean_length": 716.34375,
      "completions/mean_terminated_length": 630.2000122070312,
      "completions/min_length": 314.0,
      "completions/min_terminated_length": 314.0,
      "epoch": 2.9043209876543212,
      "grad_norm": 1.3088744357136437,
      "kl": 0.26611328125,
      "learning_rate": 1.9764723056069365e-07,
      "loss": -0.0102,
      "num_tokens": 27556405.0,
      "reward": 0.0,
      "reward_std": 0.1590607464313507,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 678.3125,
      "completions/mean_terminated_length": 581.5199584960938,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 2.9074074074074074,
      "grad_norm": 2.092974041961588,
      "kl": 0.2655029296875,
      "learning_rate": 1.9715848510303739e-07,
      "loss": -0.0471,
      "num_tokens": 27584243.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.21604543924331665,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 731.25,
      "completions/mean_terminated_length": 677.0370483398438,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 2.9104938271604937,
      "grad_norm": 1.3904426704990516,
      "kl": 0.2652587890625,
      "learning_rate": 1.966699509565685e-07,
      "loss": 0.0217,
      "num_tokens": 27614263.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19446446001529694,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 706.75,
      "completions/mean_terminated_length": 661.4285888671875,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 2.9135802469135803,
      "grad_norm": 0.9308651722253931,
      "kl": 0.26953125,
      "learning_rate": 1.961816300749163e-07,
      "loss": -0.0338,
      "num_tokens": 27643043.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1556938886642456,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 657.28125,
      "completions/mean_terminated_length": 632.8333740234375,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 2.9166666666666665,
      "grad_norm": 0.7422821322314068,
      "kl": 0.2630615234375,
      "learning_rate": 1.9569352441085712e-07,
      "loss": 0.0029,
      "num_tokens": 27670192.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 721.75,
      "completions/mean_terminated_length": 637.1199951171875,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 2.919753086419753,
      "grad_norm": 2.666645959207306,
      "kl": 0.272216796875,
      "learning_rate": 1.9520563591630686e-07,
      "loss": -0.0778,
      "num_tokens": 27700040.0,
      "reward": 0.0,
      "reward_std": 0.2386818826198578,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.4016096889972687,
      "step": 946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 676.125,
      "completions/mean_terminated_length": 626.4285888671875,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 2.9228395061728394,
      "grad_norm": 0.5929827461269377,
      "kl": 0.2696533203125,
      "learning_rate": 1.9471796654231278e-07,
      "loss": 0.0212,
      "num_tokens": 27728256.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 733.59375,
      "completions/mean_terminated_length": 666.5769653320312,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 2.925925925925926,
      "grad_norm": 0.8802088847347731,
      "kl": 0.25,
      "learning_rate": 1.9423051823904602e-07,
      "loss": 0.0108,
      "num_tokens": 27758147.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 755.03125,
      "completions/mean_terminated_length": 679.719970703125,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 2.9290123456790123,
      "grad_norm": 0.7138798730919996,
      "kl": 0.2115478515625,
      "learning_rate": 1.9374329295579372e-07,
      "loss": 0.0128,
      "num_tokens": 27788944.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 754.6875,
      "completions/mean_terminated_length": 664.9166870117188,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 2.932098765432099,
      "grad_norm": 0.9512255663617465,
      "kl": 0.260009765625,
      "learning_rate": 1.9325629264095083e-07,
      "loss": -0.0188,
      "num_tokens": 27819590.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.1554727405309677,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 722.625,
      "completions/mean_terminated_length": 653.0769653320312,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 2.935185185185185,
      "grad_norm": 1.409302550476725,
      "kl": 0.259765625,
      "learning_rate": 1.9276951924201304e-07,
      "loss": 0.063,
      "num_tokens": 27849070.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 718.40625,
      "completions/mean_terminated_length": 661.8148193359375,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 2.9382716049382713,
      "grad_norm": 1.1825110300486403,
      "kl": 0.25634765625,
      "learning_rate": 1.922829747055684e-07,
      "loss": 0.0264,
      "num_tokens": 27878551.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.15734902024269104,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 739.1875,
      "completions/mean_terminated_length": 673.4615478515625,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 2.941358024691358,
      "grad_norm": 0.5487079637357012,
      "kl": 0.2608642578125,
      "learning_rate": 1.9179666097728982e-07,
      "loss": 0.0251,
      "num_tokens": 27908785.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 769.3125,
      "completions/mean_terminated_length": 710.5385131835938,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 2.9444444444444446,
      "grad_norm": 1.6200495429814423,
      "kl": 0.24365234375,
      "learning_rate": 1.9131058000192726e-07,
      "loss": -0.0238,
      "num_tokens": 27940015.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.22852018475532532,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 774.6875,
      "completions/mean_terminated_length": 691.5833740234375,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 2.947530864197531,
      "grad_norm": 1.159852418700381,
      "kl": 0.2657470703125,
      "learning_rate": 1.9082473372329983e-07,
      "loss": -0.0226,
      "num_tokens": 27971413.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 797.34375,
      "completions/mean_terminated_length": 721.7916870117188,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 2.950617283950617,
      "grad_norm": 1.156321761903506,
      "kl": 0.261962890625,
      "learning_rate": 1.903391240842882e-07,
      "loss": 0.0187,
      "num_tokens": 28003748.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 720.78125,
      "completions/mean_terminated_length": 689.413818359375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 2.9537037037037037,
      "grad_norm": 1.1155823592840006,
      "kl": 0.2425537109375,
      "learning_rate": 1.8985375302682654e-07,
      "loss": 0.068,
      "num_tokens": 28033169.0,
      "reward": 0.0,
      "reward_std": 0.15829598903656006,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 723.34375,
      "completions/mean_terminated_length": 667.6666870117188,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 2.9567901234567904,
      "grad_norm": 0.9279735158564448,
      "kl": 0.3826904296875,
      "learning_rate": 1.8936862249189515e-07,
      "loss": -0.0023,
      "num_tokens": 28062844.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 692.71875,
      "completions/mean_terminated_length": 658.4483032226562,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 2.9598765432098766,
      "grad_norm": 0.05712688361471471,
      "kl": 0.302978515625,
      "learning_rate": 1.8888373441951228e-07,
      "loss": 0.0003,
      "num_tokens": 28091531.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 730.6875,
      "completions/mean_terminated_length": 648.5599975585938,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 2.962962962962963,
      "grad_norm": 0.9067233994247506,
      "kl": 0.3291015625,
      "learning_rate": 1.8839909074872675e-07,
      "loss": -0.0369,
      "num_tokens": 28121809.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.156063511967659,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 728.25,
      "completions/mean_terminated_length": 660.0,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 2.9660493827160495,
      "grad_norm": 1.4283129287704093,
      "kl": 0.3004150390625,
      "learning_rate": 1.8791469341761e-07,
      "loss": 0.0217,
      "num_tokens": 28151341.0,
      "reward": 0.0,
      "reward_std": 0.18008019030094147,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 735.21875,
      "completions/mean_terminated_length": 681.74072265625,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 2.9691358024691357,
      "grad_norm": 1.8621004104764296,
      "kl": 0.268310546875,
      "learning_rate": 1.8743054436324835e-07,
      "loss": 0.0051,
      "num_tokens": 28181448.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 854.0,
      "completions/mean_length": 675.40625,
      "completions/mean_terminated_length": 652.1666870117188,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 2.9722222222222223,
      "grad_norm": 1.1859538802593061,
      "kl": 0.2818603515625,
      "learning_rate": 1.8694664552173529e-07,
      "loss": -0.0056,
      "num_tokens": 28209257.0,
      "reward": 0.0,
      "reward_std": 0.14223133027553558,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1013.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 661.15625,
      "completions/mean_terminated_length": 661.15625,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 2.9753086419753085,
      "grad_norm": 1.361886747157153,
      "kl": 0.2763671875,
      "learning_rate": 1.8646299882816358e-07,
      "loss": 0.0308,
      "num_tokens": 28236594.0,
      "reward": 0.0,
      "reward_std": 0.18217067420482635,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 747.96875,
      "completions/mean_terminated_length": 696.8518676757812,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 2.978395061728395,
      "grad_norm": 0.008830294254685428,
      "kl": 0.25732421875,
      "learning_rate": 1.859796062166178e-07,
      "loss": 0.0003,
      "num_tokens": 28267045.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 740.84375,
      "completions/mean_terminated_length": 700.3928833007812,
      "completions/min_length": 547.0,
      "completions/min_terminated_length": 547.0,
      "epoch": 2.9814814814814814,
      "grad_norm": 0.009799257206895364,
      "kl": 0.2650146484375,
      "learning_rate": 1.854964696201666e-07,
      "loss": 0.0003,
      "num_tokens": 28297612.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 755.59375,
      "completions/mean_terminated_length": 666.125,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 2.984567901234568,
      "grad_norm": 1.1439797351882075,
      "kl": 0.2791748046875,
      "learning_rate": 1.850135909708544e-07,
      "loss": -0.0241,
      "num_tokens": 28328503.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 864.0,
      "completions/mean_length": 750.1875,
      "completions/mean_terminated_length": 643.0435180664062,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 2.9876543209876543,
      "grad_norm": 0.8828282768159587,
      "kl": 0.2965087890625,
      "learning_rate": 1.8453097219969448e-07,
      "loss": 0.0069,
      "num_tokens": 28359061.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 707.125,
      "completions/mean_terminated_length": 648.4444580078125,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 2.9907407407407405,
      "grad_norm": 0.9626889615466286,
      "kl": 0.3009033203125,
      "learning_rate": 1.8404861523666073e-07,
      "loss": -0.0041,
      "num_tokens": 28387885.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 969
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 715.71875,
      "completions/mean_terminated_length": 658.629638671875,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 2.993827160493827,
      "grad_norm": 0.7579628972107867,
      "kl": 0.2510986328125,
      "learning_rate": 1.8356652201068024e-07,
      "loss": 0.0201,
      "num_tokens": 28417352.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 737.03125,
      "completions/mean_terminated_length": 683.888916015625,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 2.996913580246914,
      "grad_norm": 0.9859395013297854,
      "kl": 0.266845703125,
      "learning_rate": 1.830846944496251e-07,
      "loss": -0.0003,
      "num_tokens": 28447729.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 684.53125,
      "completions/mean_terminated_length": 661.9000244140625,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 3.0,
      "grad_norm": 1.0103808656080384,
      "kl": 0.248291015625,
      "learning_rate": 1.826031344803053e-07,
      "loss": -0.0324,
      "num_tokens": 28476310.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 707.0625,
      "completions/mean_terminated_length": 685.933349609375,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 3.003086419753086,
      "grad_norm": 1.0766089362078926,
      "kl": 0.254150390625,
      "learning_rate": 1.8212184402846064e-07,
      "loss": -0.0173,
      "num_tokens": 28505100.0,
      "reward": 0.0,
      "reward_std": 0.1465846300125122,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 804.34375,
      "completions/mean_terminated_length": 704.5,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 3.006172839506173,
      "grad_norm": 0.8076015127764825,
      "kl": 0.2384033203125,
      "learning_rate": 1.8164082501875326e-07,
      "loss": -0.0094,
      "num_tokens": 28538199.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 815.625,
      "completions/mean_terminated_length": 706.4761962890625,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 3.009259259259259,
      "grad_norm": 0.7930158030735597,
      "kl": 0.2666015625,
      "learning_rate": 1.8116007937475947e-07,
      "loss": 0.0072,
      "num_tokens": 28570635.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 771.9375,
      "completions/mean_terminated_length": 725.25927734375,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 3.0123456790123457,
      "grad_norm": 0.9088831310724863,
      "kl": 0.2623291015625,
      "learning_rate": 1.8067960901896278e-07,
      "loss": -0.0142,
      "num_tokens": 28602277.0,
      "reward": 0.0,
      "reward_std": 0.15838100016117096,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 763.6875,
      "completions/mean_terminated_length": 736.7586059570312,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 3.015432098765432,
      "grad_norm": 0.7898439743118666,
      "kl": 0.2899169921875,
      "learning_rate": 1.8019941587274565e-07,
      "loss": 0.002,
      "num_tokens": 28633331.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 685.71875,
      "completions/mean_terminated_length": 637.3928833007812,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.0185185185185186,
      "grad_norm": 1.1971246125556536,
      "kl": 0.3021240234375,
      "learning_rate": 1.7971950185638195e-07,
      "loss": -0.0207,
      "num_tokens": 28661834.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 767.90625,
      "completions/mean_terminated_length": 667.6956787109375,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 3.021604938271605,
      "grad_norm": 0.87021317675508,
      "kl": 0.2828369140625,
      "learning_rate": 1.7923986888902948e-07,
      "loss": -0.0007,
      "num_tokens": 28693275.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 756.78125,
      "completions/mean_terminated_length": 681.9599609375,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 3.0246913580246915,
      "grad_norm": 0.6666076971217348,
      "kl": 0.2564697265625,
      "learning_rate": 1.78760518888722e-07,
      "loss": 0.0,
      "num_tokens": 28724056.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 759.34375,
      "completions/mean_terminated_length": 671.125,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 3.0277777777777777,
      "grad_norm": 0.922703690793308,
      "kl": 0.2677001953125,
      "learning_rate": 1.782814537723617e-07,
      "loss": -0.0329,
      "num_tokens": 28755035.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15909674763679504,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 751.21875,
      "completions/mean_terminated_length": 644.478271484375,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 3.0308641975308643,
      "grad_norm": 0.90574835232403,
      "kl": 0.3001708984375,
      "learning_rate": 1.7780267545571175e-07,
      "loss": -0.0318,
      "num_tokens": 28785762.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 681.28125,
      "completions/mean_terminated_length": 632.3214721679688,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 3.0339506172839505,
      "grad_norm": 1.2363156639509054,
      "kl": 0.2816162109375,
      "learning_rate": 1.7732418585338804e-07,
      "loss": -0.0402,
      "num_tokens": 28814039.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.2396731674671173,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 747.96875,
      "completions/mean_terminated_length": 696.8518676757812,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 3.037037037037037,
      "grad_norm": 1.0725872098852478,
      "kl": 0.249755859375,
      "learning_rate": 1.7684598687885216e-07,
      "loss": 0.0058,
      "num_tokens": 28844458.0,
      "reward": 0.0,
      "reward_std": 0.1590951681137085,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 760.75,
      "completions/mean_terminated_length": 712.0,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 3.0401234567901234,
      "grad_norm": 1.5867530014474474,
      "kl": 0.236328125,
      "learning_rate": 1.7636808044440344e-07,
      "loss": 0.0638,
      "num_tokens": 28875774.0,
      "reward": -8.847564458847046e-09,
      "reward_std": 0.1862604022026062,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 739.5,
      "completions/mean_terminated_length": 673.84619140625,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.04320987654321,
      "grad_norm": 0.9447105016801203,
      "kl": 0.291259765625,
      "learning_rate": 1.7589046846117132e-07,
      "loss": -0.0011,
      "num_tokens": 28906302.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15887358784675598,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 771.25,
      "completions/mean_terminated_length": 700.47998046875,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.0462962962962963,
      "grad_norm": 1.5922069554784521,
      "kl": 0.2728271484375,
      "learning_rate": 1.754131528391078e-07,
      "loss": 0.0127,
      "num_tokens": 28938262.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.25036370754241943,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 745.53125,
      "completions/mean_terminated_length": 652.7083740234375,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 3.049382716049383,
      "grad_norm": 0.8559104930401311,
      "kl": 0.2789306640625,
      "learning_rate": 1.7493613548697966e-07,
      "loss": -0.0055,
      "num_tokens": 28968691.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 770.84375,
      "completions/mean_terminated_length": 712.423095703125,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 3.052469135802469,
      "grad_norm": 1.2016564202779059,
      "kl": 0.2301025390625,
      "learning_rate": 1.744594183123611e-07,
      "loss": -0.0666,
      "num_tokens": 29000138.0,
      "reward": 0.0,
      "reward_std": 0.25610116124153137,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 668.375,
      "completions/mean_terminated_length": 668.375,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 3.0555555555555554,
      "grad_norm": 1.2034066140857234,
      "kl": 0.276611328125,
      "learning_rate": 1.7398300322162563e-07,
      "loss": 0.0164,
      "num_tokens": 29028274.0,
      "reward": 0.0,
      "reward_std": 0.1694464087486267,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 778.1875,
      "completions/mean_terminated_length": 666.45458984375,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 3.058641975308642,
      "grad_norm": 1.3762028459638855,
      "kl": 0.263671875,
      "learning_rate": 1.7350689211993902e-07,
      "loss": 0.0073,
      "num_tokens": 29060096.0,
      "reward": 0.0,
      "reward_std": 0.18888147175312042,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 685.09375,
      "completions/mean_terminated_length": 650.0344848632812,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 3.0617283950617282,
      "grad_norm": 1.2450297203744591,
      "kl": 0.240234375,
      "learning_rate": 1.7303108691125107e-07,
      "loss": -0.0955,
      "num_tokens": 29088315.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.18684130907058716,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 720.5,
      "completions/mean_terminated_length": 664.2963256835938,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 3.064814814814815,
      "grad_norm": 0.5146764469695151,
      "kl": 0.2337646484375,
      "learning_rate": 1.725555894982887e-07,
      "loss": 0.0184,
      "num_tokens": 29118059.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 738.625,
      "completions/mean_terminated_length": 685.7777709960938,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 3.067901234567901,
      "grad_norm": 4.2349552915993485,
      "kl": 0.2689208984375,
      "learning_rate": 1.7208040178254768e-07,
      "loss": -0.1757,
      "num_tokens": 29148259.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19500862061977386,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 663.65625,
      "completions/mean_terminated_length": 652.0322265625,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 3.0709876543209877,
      "grad_norm": 0.6132065135896924,
      "kl": 0.271728515625,
      "learning_rate": 1.716055256642855e-07,
      "loss": -0.0186,
      "num_tokens": 29176424.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 676.09375,
      "completions/mean_terminated_length": 640.1034545898438,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 3.074074074074074,
      "grad_norm": 2.8841175768517346,
      "kl": 0.2662353515625,
      "learning_rate": 1.711309630425135e-07,
      "loss": -0.2248,
      "num_tokens": 29204171.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15646925568580627,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 700.375,
      "completions/mean_terminated_length": 666.8965454101562,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 3.0771604938271606,
      "grad_norm": 0.9452856128272616,
      "kl": 0.2845458984375,
      "learning_rate": 1.7065671581498936e-07,
      "loss": -0.0021,
      "num_tokens": 29232831.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 714.59375,
      "completions/mean_terminated_length": 657.2963256835938,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 3.080246913580247,
      "grad_norm": 0.8216890799478307,
      "kl": 0.251953125,
      "learning_rate": 1.701827858782095e-07,
      "loss": 0.0576,
      "num_tokens": 29261990.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 681.90625,
      "completions/mean_terminated_length": 659.1000366210938,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 3.0833333333333335,
      "grad_norm": 0.4945412077311694,
      "kl": 0.254638671875,
      "learning_rate": 1.697091751274016e-07,
      "loss": 0.0264,
      "num_tokens": 29290151.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 725.59375,
      "completions/mean_terminated_length": 682.9642944335938,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 3.0864197530864197,
      "grad_norm": 1.3606006021037704,
      "kl": 0.23291015625,
      "learning_rate": 1.6923588545651672e-07,
      "loss": 0.0241,
      "num_tokens": 29320146.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.14860975742340088,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 694.65625,
      "completions/mean_terminated_length": 647.607177734375,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 3.0895061728395063,
      "grad_norm": 1.290694181029074,
      "kl": 0.2579345703125,
      "learning_rate": 1.687629187582221e-07,
      "loss": -0.043,
      "num_tokens": 29348679.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.21570174396038055,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 704.625,
      "completions/mean_terminated_length": 645.4815063476562,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 3.0925925925925926,
      "grad_norm": 1.30463801788802,
      "kl": 0.2574462890625,
      "learning_rate": 1.6829027692389343e-07,
      "loss": 0.1353,
      "num_tokens": 29377927.0,
      "reward": 0.0,
      "reward_std": 0.13204941153526306,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 718.0625,
      "completions/mean_terminated_length": 674.357177734375,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 3.095679012345679,
      "grad_norm": 1.762064383114544,
      "kl": 0.251708984375,
      "learning_rate": 1.678179618436073e-07,
      "loss": -0.1035,
      "num_tokens": 29407613.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.2035190612077713,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 763.21875,
      "completions/mean_terminated_length": 676.2916870117188,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 3.0987654320987654,
      "grad_norm": 1.394911026973124,
      "kl": 0.256103515625,
      "learning_rate": 1.6734597540613344e-07,
      "loss": -0.005,
      "num_tokens": 29438844.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18761375546455383,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 723.34375,
      "completions/mean_terminated_length": 639.1599731445312,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 3.1018518518518516,
      "grad_norm": 1.8197181485250147,
      "kl": 0.3023681640625,
      "learning_rate": 1.6687431949892753e-07,
      "loss": 0.0472,
      "num_tokens": 29468407.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 667.96875,
      "completions/mean_terminated_length": 644.2333374023438,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 3.1049382716049383,
      "grad_norm": 1.3329960123541742,
      "kl": 0.281494140625,
      "learning_rate": 1.664029960081234e-07,
      "loss": -0.028,
      "num_tokens": 29496134.0,
      "reward": 0.0,
      "reward_std": 0.19164466857910156,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 699.125,
      "completions/mean_terminated_length": 638.9629516601562,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 3.1080246913580245,
      "grad_norm": 0.008444538809175298,
      "kl": 0.248779296875,
      "learning_rate": 1.6593200681852574e-07,
      "loss": 0.0002,
      "num_tokens": 29524710.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 734.9375,
      "completions/mean_terminated_length": 693.6428833007812,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 3.111111111111111,
      "grad_norm": 1.2221476308156387,
      "kl": 0.24755859375,
      "learning_rate": 1.6546135381360194e-07,
      "loss": -0.0128,
      "num_tokens": 29554292.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 778.40625,
      "completions/mean_terminated_length": 709.6400146484375,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.1141975308641974,
      "grad_norm": 1.0889000163622438,
      "kl": 0.2532958984375,
      "learning_rate": 1.6499103887547544e-07,
      "loss": 0.0024,
      "num_tokens": 29585645.0,
      "reward": 0.0,
      "reward_std": 0.1917317807674408,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 705.53125,
      "completions/mean_terminated_length": 672.586181640625,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 3.117283950617284,
      "grad_norm": 0.5614509907699836,
      "kl": 0.2498779296875,
      "learning_rate": 1.6452106388491762e-07,
      "loss": 0.0144,
      "num_tokens": 29614442.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 751.28125,
      "completions/mean_terminated_length": 688.34619140625,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 3.1203703703703702,
      "grad_norm": 0.6057075822312525,
      "kl": 0.2852783203125,
      "learning_rate": 1.6405143072134031e-07,
      "loss": 0.0124,
      "num_tokens": 29644767.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 764.0625,
      "completions/mean_terminated_length": 726.9285888671875,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 3.123456790123457,
      "grad_norm": 0.46899477512584786,
      "kl": 0.271240234375,
      "learning_rate": 1.6358214126278855e-07,
      "loss": 0.0027,
      "num_tokens": 29675629.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 778.25,
      "completions/mean_terminated_length": 682.0869750976562,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 3.126543209876543,
      "grad_norm": 1.1254396274309257,
      "kl": 0.2421875,
      "learning_rate": 1.6311319738593281e-07,
      "loss": -0.035,
      "num_tokens": 29707209.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 779.9375,
      "completions/mean_terminated_length": 698.5833740234375,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 3.1296296296296298,
      "grad_norm": 1.003660901667382,
      "kl": 0.24365234375,
      "learning_rate": 1.6264460096606169e-07,
      "loss": -0.0082,
      "num_tokens": 29738651.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 763.875,
      "completions/mean_terminated_length": 715.7037353515625,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 3.132716049382716,
      "grad_norm": 1.328120572842891,
      "kl": 0.2237548828125,
      "learning_rate": 1.621763538770743e-07,
      "loss": -0.0924,
      "num_tokens": 29769803.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.20375239849090576,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 770.71875,
      "completions/mean_terminated_length": 723.8148193359375,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.1358024691358026,
      "grad_norm": 1.0646566319626596,
      "kl": 0.278564453125,
      "learning_rate": 1.6170845799147266e-07,
      "loss": 0.0081,
      "num_tokens": 29800950.0,
      "reward": 0.0,
      "reward_std": 0.13428640365600586,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 735.21875,
      "completions/mean_terminated_length": 693.9642944335938,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 3.138888888888889,
      "grad_norm": 0.9780147807978669,
      "kl": 0.2899169921875,
      "learning_rate": 1.6124091518035443e-07,
      "loss": -0.0277,
      "num_tokens": 29830617.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 688.4375,
      "completions/mean_terminated_length": 653.72412109375,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 3.1419753086419755,
      "grad_norm": 0.023944795044643093,
      "kl": 0.2646484375,
      "learning_rate": 1.607737273134054e-07,
      "loss": 0.0003,
      "num_tokens": 29858863.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 757.75,
      "completions/mean_terminated_length": 696.3077392578125,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.1450617283950617,
      "grad_norm": 1.3792267988657705,
      "kl": 0.251708984375,
      "learning_rate": 1.603068962588918e-07,
      "loss": -0.0015,
      "num_tokens": 29889463.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.22755002975463867,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 799.0,
      "completions/mean_terminated_length": 724.0,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 3.148148148148148,
      "grad_norm": 0.8595002612418882,
      "kl": 0.28173828125,
      "learning_rate": 1.598404238836532e-07,
      "loss": -0.0384,
      "num_tokens": 29921719.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 788.25,
      "completions/mean_terminated_length": 733.84619140625,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 3.1512345679012346,
      "grad_norm": 1.1107760097980046,
      "kl": 0.221923828125,
      "learning_rate": 1.5937431205309465e-07,
      "loss": -0.045,
      "num_tokens": 29953707.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 742.46875,
      "completions/mean_terminated_length": 702.2500610351562,
      "completions/min_length": 589.0,
      "completions/min_terminated_length": 589.0,
      "epoch": 3.154320987654321,
      "grad_norm": 0.793439979172092,
      "kl": 0.2550048828125,
      "learning_rate": 1.589085626311795e-07,
      "loss": -0.0012,
      "num_tokens": 29983438.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1385486125946045,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 690.375,
      "completions/mean_terminated_length": 628.5925903320312,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 3.1574074074074074,
      "grad_norm": 1.3473054361293963,
      "kl": 0.27783203125,
      "learning_rate": 1.5844317748042167e-07,
      "loss": -0.0377,
      "num_tokens": 30012114.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 745.78125,
      "completions/mean_terminated_length": 717.0,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.1604938271604937,
      "grad_norm": 1.5521047207589356,
      "kl": 0.236572265625,
      "learning_rate": 1.5797815846187868e-07,
      "loss": -0.0563,
      "num_tokens": 30042235.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 757.125,
      "completions/mean_terminated_length": 707.7037353515625,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.1635802469135803,
      "grad_norm": 0.5039402374992891,
      "kl": 0.2437744140625,
      "learning_rate": 1.575135074351435e-07,
      "loss": 0.0066,
      "num_tokens": 30073135.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 744.78125,
      "completions/mean_terminated_length": 693.0740966796875,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 3.1666666666666665,
      "grad_norm": 1.268203203867581,
      "kl": 0.281005859375,
      "learning_rate": 1.5704922625833784e-07,
      "loss": -0.0174,
      "num_tokens": 30103584.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 668.625,
      "completions/mean_terminated_length": 644.933349609375,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.169753086419753,
      "grad_norm": 1.0867712934423577,
      "kl": 0.2960205078125,
      "learning_rate": 1.565853167881042e-07,
      "loss": 0.0045,
      "num_tokens": 30131120.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 759.59375,
      "completions/mean_terminated_length": 698.5769653320312,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 3.1728395061728394,
      "grad_norm": 1.0793686524716646,
      "kl": 0.2525634765625,
      "learning_rate": 1.5612178087959887e-07,
      "loss": 0.0003,
      "num_tokens": 30161587.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 743.65625,
      "completions/mean_terminated_length": 703.607177734375,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 3.175925925925926,
      "grad_norm": 1.1451486257740306,
      "kl": 0.2615966796875,
      "learning_rate": 1.556586203864841e-07,
      "loss": 0.0223,
      "num_tokens": 30191852.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 689.46875,
      "completions/mean_terminated_length": 627.5184936523438,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 3.1790123456790123,
      "grad_norm": 0.009643798735085194,
      "kl": 0.255615234375,
      "learning_rate": 1.5519583716092077e-07,
      "loss": 0.0003,
      "num_tokens": 30220263.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 764.40625,
      "completions/mean_terminated_length": 677.875,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 3.182098765432099,
      "grad_norm": 1.4809011453828196,
      "kl": 0.2730712890625,
      "learning_rate": 1.5473343305356136e-07,
      "loss": 0.0179,
      "num_tokens": 30251648.0,
      "reward": 0.0,
      "reward_std": 0.19228190183639526,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 726.90625,
      "completions/mean_terminated_length": 643.719970703125,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 3.185185185185185,
      "grad_norm": 2.3859976342186706,
      "kl": 0.2730712890625,
      "learning_rate": 1.5427140991354215e-07,
      "loss": 0.0577,
      "num_tokens": 30281337.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.23513615131378174,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 707.78125,
      "completions/mean_terminated_length": 686.7000122070312,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 3.1882716049382718,
      "grad_norm": 2.2456635856547305,
      "kl": 0.28466796875,
      "learning_rate": 1.5380976958847572e-07,
      "loss": -0.2009,
      "num_tokens": 30310466.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 730.6875,
      "completions/mean_terminated_length": 688.7857666015625,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.191358024691358,
      "grad_norm": 0.9617662099963058,
      "kl": 0.2615966796875,
      "learning_rate": 1.5334851392444412e-07,
      "loss": 0.0187,
      "num_tokens": 30340316.0,
      "reward": 0.0,
      "reward_std": 0.15636713802814484,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 761.5625,
      "completions/mean_terminated_length": 674.0833740234375,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 3.1944444444444446,
      "grad_norm": 1.057851202950758,
      "kl": 0.2554931640625,
      "learning_rate": 1.5288764476599102e-07,
      "loss": 0.0037,
      "num_tokens": 30370982.0,
      "reward": 0.0,
      "reward_std": 0.15223759412765503,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 965.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 646.375,
      "completions/mean_terminated_length": 646.375,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 3.197530864197531,
      "grad_norm": 1.311554180537854,
      "kl": 0.250244140625,
      "learning_rate": 1.524271639561145e-07,
      "loss": 0.0346,
      "num_tokens": 30397526.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.13557207584381104,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 691.1875,
      "completions/mean_terminated_length": 656.7586059570312,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 3.200617283950617,
      "grad_norm": 1.0566047366456837,
      "kl": 0.2537841796875,
      "learning_rate": 1.5196707333625959e-07,
      "loss": 0.0182,
      "num_tokens": 30425804.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 734.28125,
      "completions/mean_terminated_length": 653.1599731445312,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 3.2037037037037037,
      "grad_norm": 2.6196208044284663,
      "kl": 0.2647705078125,
      "learning_rate": 1.5150737474631092e-07,
      "loss": -0.2351,
      "num_tokens": 30455721.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 767.53125,
      "completions/mean_terminated_length": 708.34619140625,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 3.20679012345679,
      "grad_norm": 0.664531621539753,
      "kl": 0.25732421875,
      "learning_rate": 1.5104807002458564e-07,
      "loss": -0.008,
      "num_tokens": 30487166.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 793.78125,
      "completions/mean_terminated_length": 729.3200073242188,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 3.2098765432098766,
      "grad_norm": 0.0431546602367175,
      "kl": 0.2581787109375,
      "learning_rate": 1.5058916100782555e-07,
      "loss": 0.0003,
      "num_tokens": 30519015.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 789.6875,
      "completions/mean_terminated_length": 735.6154174804688,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 3.212962962962963,
      "grad_norm": 0.8377817886949486,
      "kl": 0.25732421875,
      "learning_rate": 1.5013064953119036e-07,
      "loss": -0.0042,
      "num_tokens": 30550509.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 747.375,
      "completions/mean_terminated_length": 655.1666870117188,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 3.2160493827160495,
      "grad_norm": 2.215482458994797,
      "kl": 0.254638671875,
      "learning_rate": 1.4967253742824962e-07,
      "loss": -0.1136,
      "num_tokens": 30581609.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.18394683301448822,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 747.625,
      "completions/mean_terminated_length": 683.84619140625,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 3.2191358024691357,
      "grad_norm": 0.9499102220498177,
      "kl": 0.2445068359375,
      "learning_rate": 1.4921482653097614e-07,
      "loss": -0.0531,
      "num_tokens": 30612329.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 768.375,
      "completions/mean_terminated_length": 709.3846435546875,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 3.2222222222222223,
      "grad_norm": 0.5653222476164992,
      "kl": 0.260498046875,
      "learning_rate": 1.487575186697381e-07,
      "loss": -0.0054,
      "num_tokens": 30643305.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 732.28125,
      "completions/mean_terminated_length": 678.25927734375,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 3.2253086419753085,
      "grad_norm": 1.9291025282518113,
      "kl": 0.2587890625,
      "learning_rate": 1.4830061567329223e-07,
      "loss": 0.0277,
      "num_tokens": 30673390.0,
      "reward": 0.0,
      "reward_std": 0.25847262144088745,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 776.1875,
      "completions/mean_terminated_length": 646.3809814453125,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 3.228395061728395,
      "grad_norm": 0.027935652357127107,
      "kl": 0.271484375,
      "learning_rate": 1.4784411936877596e-07,
      "loss": 0.0003,
      "num_tokens": 30704996.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 695.0,
      "completions/mean_terminated_length": 634.0740966796875,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 3.2314814814814814,
      "grad_norm": 1.0983382773162031,
      "kl": 0.272705078125,
      "learning_rate": 1.4738803158170043e-07,
      "loss": 0.0155,
      "num_tokens": 30733664.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.1455036699771881,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 732.1875,
      "completions/mean_terminated_length": 712.7333984375,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 3.234567901234568,
      "grad_norm": 1.0642464302132313,
      "kl": 0.24951171875,
      "learning_rate": 1.469323541359433e-07,
      "loss": 0.0133,
      "num_tokens": 30763654.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.15508639812469482,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 744.46875,
      "completions/mean_terminated_length": 704.5357666015625,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.2376543209876543,
      "grad_norm": 0.879023680963078,
      "kl": 0.2646484375,
      "learning_rate": 1.4647708885374105e-07,
      "loss": 0.0084,
      "num_tokens": 30793989.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.08606424182653427,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 722.90625,
      "completions/mean_terminated_length": 691.7586059570312,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 3.240740740740741,
      "grad_norm": 0.8895204420052543,
      "kl": 0.2540283203125,
      "learning_rate": 1.4602223755568212e-07,
      "loss": 0.0001,
      "num_tokens": 30823990.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 753.15625,
      "completions/mean_terminated_length": 690.6538696289062,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 3.243827160493827,
      "grad_norm": 0.9153946029916924,
      "kl": 0.228515625,
      "learning_rate": 1.4556780206069925e-07,
      "loss": 0.0087,
      "num_tokens": 30855143.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 729.4375,
      "completions/mean_terminated_length": 661.4615478515625,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 3.246913580246914,
      "grad_norm": 1.1455629101525655,
      "kl": 0.2523193359375,
      "learning_rate": 1.4511378418606272e-07,
      "loss": -0.0103,
      "num_tokens": 30884541.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 757.21875,
      "completions/mean_terminated_length": 695.6538696289062,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 3.25,
      "grad_norm": 0.7353234991734671,
      "kl": 0.251220703125,
      "learning_rate": 1.4466018574737236e-07,
      "loss": 0.0218,
      "num_tokens": 30915308.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 787.3125,
      "completions/mean_terminated_length": 753.5000610351562,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 3.253086419753086,
      "grad_norm": 0.4477869406085615,
      "kl": 0.245849609375,
      "learning_rate": 1.4420700855855093e-07,
      "loss": -0.0192,
      "num_tokens": 30947126.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 757.90625,
      "completions/mean_terminated_length": 696.5,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 3.256172839506173,
      "grad_norm": 0.8646740435234543,
      "kl": 0.23583984375,
      "learning_rate": 1.4375425443183675e-07,
      "loss": 0.0188,
      "num_tokens": 30977967.0,
      "reward": 0.0,
      "reward_std": 0.13030678033828735,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 782.125,
      "completions/mean_terminated_length": 726.3077392578125,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 3.259259259259259,
      "grad_norm": 1.1082173590712923,
      "kl": 0.3033447265625,
      "learning_rate": 1.43301925177776e-07,
      "loss": -0.0765,
      "num_tokens": 31009255.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 733.75,
      "completions/mean_terminated_length": 637.0,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 3.2623456790123457,
      "grad_norm": 0.010185563589904368,
      "kl": 0.2779541015625,
      "learning_rate": 1.4285002260521617e-07,
      "loss": 0.0003,
      "num_tokens": 31039171.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 862.0,
      "completions/mean_length": 753.09375,
      "completions/mean_terminated_length": 647.0869750976562,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 3.265432098765432,
      "grad_norm": 0.6807906915146793,
      "kl": 0.336669921875,
      "learning_rate": 1.4239854852129807e-07,
      "loss": -0.0032,
      "num_tokens": 31069702.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 767.71875,
      "completions/mean_terminated_length": 682.2916870117188,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.2685185185185186,
      "grad_norm": 1.1954869638403682,
      "kl": 0.26123046875,
      "learning_rate": 1.419475047314493e-07,
      "loss": -0.0103,
      "num_tokens": 31100769.0,
      "reward": 0.0,
      "reward_std": 0.18885569274425507,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 728.09375,
      "completions/mean_terminated_length": 708.36669921875,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.271604938271605,
      "grad_norm": 2.2188825714435043,
      "kl": 0.266357421875,
      "learning_rate": 1.4149689303937662e-07,
      "loss": -0.2275,
      "num_tokens": 31130304.0,
      "reward": 0.0,
      "reward_std": 0.15134452283382416,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 722.875,
      "completions/mean_terminated_length": 679.857177734375,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 3.2746913580246915,
      "grad_norm": 0.981641484247448,
      "kl": 0.22900390625,
      "learning_rate": 1.4104671524705892e-07,
      "loss": -0.0627,
      "num_tokens": 31159812.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14842106401920319,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 750.34375,
      "completions/mean_terminated_length": 687.1923217773438,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 3.2777777777777777,
      "grad_norm": 0.7865097763068806,
      "kl": 0.2552490234375,
      "learning_rate": 1.4059697315473988e-07,
      "loss": 0.0411,
      "num_tokens": 31190075.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.08606424182653427,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 725.53125,
      "completions/mean_terminated_length": 670.25927734375,
      "completions/min_length": 340.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 3.2808641975308643,
      "grad_norm": 1.5871852156633488,
      "kl": 0.2781982421875,
      "learning_rate": 1.4014766856092081e-07,
      "loss": -0.0573,
      "num_tokens": 31219832.0,
      "reward": -2.561137080192566e-09,
      "reward_std": 0.18082474172115326,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 773.90625,
      "completions/mean_terminated_length": 703.8800048828125,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 3.2839506172839505,
      "grad_norm": 0.7425942490096938,
      "kl": 0.24951171875,
      "learning_rate": 1.3969880326235362e-07,
      "loss": -0.0105,
      "num_tokens": 31251409.0,
      "reward": 0.0,
      "reward_std": 0.12664207816123962,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 763.59375,
      "completions/mean_terminated_length": 726.3928833007812,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 3.287037037037037,
      "grad_norm": 1.2067555235365977,
      "kl": 0.2589111328125,
      "learning_rate": 1.3925037905403324e-07,
      "loss": -0.0674,
      "num_tokens": 31282240.0,
      "reward": 0.0,
      "reward_std": 0.1882191002368927,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 795.65625,
      "completions/mean_terminated_length": 706.3043823242188,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 3.2901234567901234,
      "grad_norm": 1.299680625758312,
      "kl": 0.2777099609375,
      "learning_rate": 1.38802397729191e-07,
      "loss": 0.0594,
      "num_tokens": 31314637.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 750.5625,
      "completions/mean_terminated_length": 711.5000610351562,
      "completions/min_length": 538.0,
      "completions/min_terminated_length": 538.0,
      "epoch": 3.29320987654321,
      "grad_norm": 1.2129723717473224,
      "kl": 0.2320556640625,
      "learning_rate": 1.3835486107928678e-07,
      "loss": -0.0404,
      "num_tokens": 31345275.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 726.25,
      "completions/mean_terminated_length": 695.4483032226562,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 3.2962962962962963,
      "grad_norm": 0.6936382173321262,
      "kl": 0.268798828125,
      "learning_rate": 1.3790777089400262e-07,
      "loss": -0.0224,
      "num_tokens": 31374659.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 754.40625,
      "completions/mean_terminated_length": 678.9199829101562,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 3.299382716049383,
      "grad_norm": 1.4164979976340009,
      "kl": 0.2735595703125,
      "learning_rate": 1.3746112896123494e-07,
      "loss": -0.0427,
      "num_tokens": 31405896.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.24048790335655212,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1069
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 811.53125,
      "completions/mean_terminated_length": 781.1785888671875,
      "completions/min_length": 558.0,
      "completions/min_terminated_length": 558.0,
      "epoch": 3.302469135802469,
      "grad_norm": 0.011120476702207317,
      "kl": 0.247314453125,
      "learning_rate": 1.3701493706708768e-07,
      "loss": 0.0002,
      "num_tokens": 31439049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 767.59375,
      "completions/mean_terminated_length": 741.0689697265625,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 3.3055555555555554,
      "grad_norm": 0.483959515195949,
      "kl": 0.2911376953125,
      "learning_rate": 1.3656919699586503e-07,
      "loss": 0.0124,
      "num_tokens": 31469976.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 748.65625,
      "completions/mean_terminated_length": 685.1154174804688,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 3.308641975308642,
      "grad_norm": 0.7348895873578779,
      "kl": 0.248779296875,
      "learning_rate": 1.3612391053006446e-07,
      "loss": -0.042,
      "num_tokens": 31500493.0,
      "reward": 0.05624999850988388,
      "reward_std": 0.06495190411806107,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0625,
      "rewards/logprob_reward/std": 0.24593468010425568,
      "step": 1072
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 722.125,
      "completions/mean_terminated_length": 679.0,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 3.3117283950617282,
      "grad_norm": 1.1887001755579771,
      "kl": NaN,
      "learning_rate": 1.356790794503694e-07,
      "loss": -0.0204,
      "num_tokens": 31530165.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 825.3125,
      "completions/mean_terminated_length": 779.4615478515625,
      "completions/min_length": 532.0,
      "completions/min_terminated_length": 532.0,
      "epoch": 3.314814814814815,
      "grad_norm": 0.012406080713913019,
      "kl": 0.2333984375,
      "learning_rate": 1.3523470553564238e-07,
      "loss": 0.0002,
      "num_tokens": 31563507.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 716.78125,
      "completions/mean_terminated_length": 672.8928833007812,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.317901234567901,
      "grad_norm": 0.5743044774276269,
      "kl": 0.2496337890625,
      "learning_rate": 1.3479079056291738e-07,
      "loss": 0.0087,
      "num_tokens": 31592920.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 788.0625,
      "completions/mean_terminated_length": 709.4166870117188,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 3.3209876543209877,
      "grad_norm": 3.6920513030189674,
      "kl": 0.2532958984375,
      "learning_rate": 1.3434733630739345e-07,
      "loss": -0.143,
      "num_tokens": 31625150.0,
      "reward": 0.0,
      "reward_std": 0.20147258043289185,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 729.25,
      "completions/mean_terminated_length": 687.1428833007812,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 3.324074074074074,
      "grad_norm": 0.7065504635632736,
      "kl": 0.25,
      "learning_rate": 1.3390434454242704e-07,
      "loss": 0.0049,
      "num_tokens": 31654834.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 746.3125,
      "completions/mean_terminated_length": 717.586181640625,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 3.3271604938271606,
      "grad_norm": 0.8346510215181216,
      "kl": 0.2330322265625,
      "learning_rate": 1.334618170395254e-07,
      "loss": -0.0133,
      "num_tokens": 31684944.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 707.4375,
      "completions/mean_terminated_length": 697.2257690429688,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 3.330246913580247,
      "grad_norm": 1.4942222054853591,
      "kl": 0.248291015625,
      "learning_rate": 1.3301975556833872e-07,
      "loss": -0.0873,
      "num_tokens": 31713582.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19293318688869476,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1079
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 822.84375,
      "completions/mean_terminated_length": 731.4091186523438,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 3.3333333333333335,
      "grad_norm": 1.0428453264332782,
      "kl": 0.2427978515625,
      "learning_rate": 1.3257816189665398e-07,
      "loss": -0.0245,
      "num_tokens": 31746273.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 821.84375,
      "completions/mean_terminated_length": 754.4583740234375,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 3.3364197530864197,
      "grad_norm": 1.3184160835714214,
      "kl": 0.22412109375,
      "learning_rate": 1.3213703779038726e-07,
      "loss": 0.0191,
      "num_tokens": 31779844.0,
      "reward": 0.0,
      "reward_std": 0.15904246270656586,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 742.28125,
      "completions/mean_terminated_length": 702.0357666015625,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 3.3395061728395063,
      "grad_norm": 0.7748820346284045,
      "kl": 0.269287109375,
      "learning_rate": 1.3169638501357697e-07,
      "loss": 0.0289,
      "num_tokens": 31809781.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 763.96875,
      "completions/mean_terminated_length": 703.9615478515625,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 3.3425925925925926,
      "grad_norm": 1.555543362600614,
      "kl": 0.2437744140625,
      "learning_rate": 1.3125620532837667e-07,
      "loss": 0.0216,
      "num_tokens": 31840768.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2136804610490799,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 729.65625,
      "completions/mean_terminated_length": 675.1481323242188,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 3.3456790123456788,
      "grad_norm": 0.9813131603858801,
      "kl": 0.2388916015625,
      "learning_rate": 1.3081650049504784e-07,
      "loss": -0.0027,
      "num_tokens": 31869985.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 729.71875,
      "completions/mean_terminated_length": 647.3200073242188,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 3.3487654320987654,
      "grad_norm": 1.2093380139341139,
      "kl": 0.2509765625,
      "learning_rate": 1.3037727227195333e-07,
      "loss": -0.0696,
      "num_tokens": 31899804.0,
      "reward": 0.0,
      "reward_std": 0.15134452283382416,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 786.1875,
      "completions/mean_terminated_length": 752.2142944335938,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 3.351851851851852,
      "grad_norm": 0.6034368765472946,
      "kl": 0.259033203125,
      "learning_rate": 1.2993852241554986e-07,
      "loss": -0.0151,
      "num_tokens": 31931686.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 701.8125,
      "completions/mean_terminated_length": 655.7857666015625,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 3.3549382716049383,
      "grad_norm": 1.227791919392234,
      "kl": 0.24609375,
      "learning_rate": 1.295002526803813e-07,
      "loss": -0.1123,
      "num_tokens": 31960116.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14231424033641815,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 793.90625,
      "completions/mean_terminated_length": 703.8695678710938,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 3.3580246913580245,
      "grad_norm": 1.1499498151806773,
      "kl": 0.2586669921875,
      "learning_rate": 1.2906246481907145e-07,
      "loss": -0.0692,
      "num_tokens": 31992133.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13885828852653503,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 754.0,
      "completions/mean_terminated_length": 678.3999633789062,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 3.361111111111111,
      "grad_norm": 0.4967767734834555,
      "kl": 0.25,
      "learning_rate": 1.2862516058231718e-07,
      "loss": 0.0055,
      "num_tokens": 32022453.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 709.625,
      "completions/mean_terminated_length": 664.7142944335938,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 3.3641975308641974,
      "grad_norm": 0.5488839834782302,
      "kl": 0.2513427734375,
      "learning_rate": 1.2818834171888136e-07,
      "loss": 0.0125,
      "num_tokens": 32051645.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 783.53125,
      "completions/mean_terminated_length": 728.0385131835938,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 3.367283950617284,
      "grad_norm": 1.1988321545343161,
      "kl": 0.283935546875,
      "learning_rate": 1.277520099755857e-07,
      "loss": 0.0258,
      "num_tokens": 32083502.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 753.46875,
      "completions/mean_terminated_length": 714.8214721679688,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 3.3703703703703702,
      "grad_norm": 0.7860866149746452,
      "kl": 0.241943359375,
      "learning_rate": 1.2731616709730428e-07,
      "loss": 0.0138,
      "num_tokens": 32114093.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 755.46875,
      "completions/mean_terminated_length": 693.5,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 3.373456790123457,
      "grad_norm": 1.9355067929149514,
      "kl": 0.2366943359375,
      "learning_rate": 1.2688081482695577e-07,
      "loss": 0.1288,
      "num_tokens": 32145040.0,
      "reward": 0.0,
      "reward_std": 0.15597835183143616,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 721.9375,
      "completions/mean_terminated_length": 701.800048828125,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 3.376543209876543,
      "grad_norm": 0.4790221526932161,
      "kl": 0.242431640625,
      "learning_rate": 1.264459549054973e-07,
      "loss": -0.018,
      "num_tokens": 32174690.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 801.25,
      "completions/mean_terminated_length": 727.0,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 3.3796296296296298,
      "grad_norm": 0.7093664604532016,
      "kl": 0.2525634765625,
      "learning_rate": 1.2601158907191696e-07,
      "loss": -0.0215,
      "num_tokens": 32207174.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.12072179466485977,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 752.4375,
      "completions/mean_terminated_length": 702.1481323242188,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 3.382716049382716,
      "grad_norm": 0.16629034886974875,
      "kl": 0.26416015625,
      "learning_rate": 1.2557771906322704e-07,
      "loss": 0.0003,
      "num_tokens": 32237924.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 687.0,
      "completions/mean_terminated_length": 652.137939453125,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 3.3858024691358026,
      "grad_norm": 0.013630451196816433,
      "kl": 0.266845703125,
      "learning_rate": 1.2514434661445706e-07,
      "loss": 0.0003,
      "num_tokens": 32266056.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 788.3125,
      "completions/mean_terminated_length": 709.75,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 3.388888888888889,
      "grad_norm": 1.125990052255359,
      "kl": 0.2408447265625,
      "learning_rate": 1.2471147345864672e-07,
      "loss": -0.0432,
      "num_tokens": 32297386.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2222922444343567,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 763.09375,
      "completions/mean_terminated_length": 725.8214721679688,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 3.3919753086419755,
      "grad_norm": 0.9748479973086155,
      "kl": 0.253173828125,
      "learning_rate": 1.2427910132683928e-07,
      "loss": -0.069,
      "num_tokens": 32328577.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1099
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 705.9375,
      "completions/mean_terminated_length": 673.0344848632812,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 3.3950617283950617,
      "grad_norm": 1.1069650475836328,
      "kl": 0.2342529296875,
      "learning_rate": 1.2384723194807408e-07,
      "loss": 0.092,
      "num_tokens": 32357383.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1355731189250946,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 716.21875,
      "completions/mean_terminated_length": 659.2222290039062,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 3.398148148148148,
      "grad_norm": 1.0494479584682614,
      "kl": 0.273193359375,
      "learning_rate": 1.234158670493803e-07,
      "loss": 0.0144,
      "num_tokens": 32386702.0,
      "reward": 0.0,
      "reward_std": 0.188789963722229,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 757.96875,
      "completions/mean_terminated_length": 708.7037353515625,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 3.4012345679012346,
      "grad_norm": 2.099566012880971,
      "kl": 0.2247314453125,
      "learning_rate": 1.229850083557695e-07,
      "loss": 0.0934,
      "num_tokens": 32418061.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 772.125,
      "completions/mean_terminated_length": 714.0,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 3.4043209876543212,
      "grad_norm": 1.4716119220150468,
      "kl": 0.2271728515625,
      "learning_rate": 1.2255465759022913e-07,
      "loss": -0.0753,
      "num_tokens": 32449585.0,
      "reward": 0.0,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 782.21875,
      "completions/mean_terminated_length": 714.5199584960938,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.4074074074074074,
      "grad_norm": 0.5752351183998141,
      "kl": 0.271484375,
      "learning_rate": 1.2212481647371542e-07,
      "loss": -0.0129,
      "num_tokens": 32481344.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 767.3125,
      "completions/mean_terminated_length": 681.75,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 3.4104938271604937,
      "grad_norm": 0.017090939602922153,
      "kl": 0.2645263671875,
      "learning_rate": 1.2169548672514625e-07,
      "loss": 0.0003,
      "num_tokens": 32512458.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 788.15625,
      "completions/mean_terminated_length": 733.7307739257812,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 3.4135802469135803,
      "grad_norm": 0.9463338151541665,
      "kl": 0.24658203125,
      "learning_rate": 1.2126667006139495e-07,
      "loss": -0.007,
      "num_tokens": 32544003.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 877.0,
      "completions/mean_length": 787.59375,
      "completions/mean_terminated_length": 708.7916870117188,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 3.4166666666666665,
      "grad_norm": 0.7856020656526064,
      "kl": 0.2403564453125,
      "learning_rate": 1.208383681972829e-07,
      "loss": -0.0307,
      "num_tokens": 32576406.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 769.625,
      "completions/mean_terminated_length": 743.3103637695312,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 3.419753086419753,
      "grad_norm": 0.5092206644316002,
      "kl": 0.2269287109375,
      "learning_rate": 1.2041058284557277e-07,
      "loss": 0.0238,
      "num_tokens": 32607754.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 794.6875,
      "completions/mean_terminated_length": 730.47998046875,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 3.4228395061728394,
      "grad_norm": 1.3536454292536797,
      "kl": 0.2601318359375,
      "learning_rate": 1.1998331571696162e-07,
      "loss": -0.0313,
      "num_tokens": 32639904.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1885068714618683,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 736.65625,
      "completions/mean_terminated_length": 695.607177734375,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 3.425925925925926,
      "grad_norm": 0.9080027433099632,
      "kl": 0.21630859375,
      "learning_rate": 1.1955656852007438e-07,
      "loss": 0.0082,
      "num_tokens": 32669957.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 740.71875,
      "completions/mean_terminated_length": 711.413818359375,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 3.4290123456790123,
      "grad_norm": 1.20411101267003,
      "kl": 0.25634765625,
      "learning_rate": 1.1913034296145669e-07,
      "loss": -0.0067,
      "num_tokens": 32700268.0,
      "reward": 0.0,
      "reward_std": 0.14057862758636475,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 711.03125,
      "completions/mean_terminated_length": 653.0740966796875,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 3.432098765432099,
      "grad_norm": 1.5756966519011866,
      "kl": 0.245361328125,
      "learning_rate": 1.1870464074556816e-07,
      "loss": -0.032,
      "num_tokens": 32729513.0,
      "reward": 0.0,
      "reward_std": 0.15422560274600983,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 729.375,
      "completions/mean_terminated_length": 698.8965454101562,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 3.435185185185185,
      "grad_norm": 1.3969578411745558,
      "kl": 0.2464599609375,
      "learning_rate": 1.1827946357477559e-07,
      "loss": -0.0242,
      "num_tokens": 32759557.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 743.25,
      "completions/mean_terminated_length": 691.25927734375,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.4382716049382718,
      "grad_norm": 0.6396342570664947,
      "kl": 0.2535400390625,
      "learning_rate": 1.1785481314934618e-07,
      "loss": -0.027,
      "num_tokens": 32789885.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 775.25,
      "completions/mean_terminated_length": 692.3333740234375,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 3.441358024691358,
      "grad_norm": 1.643829047253925,
      "kl": 0.2890625,
      "learning_rate": 1.1743069116744064e-07,
      "loss": 0.0054,
      "num_tokens": 32821453.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.18929949402809143,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 775.375,
      "completions/mean_terminated_length": 678.0869750976562,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 3.4444444444444446,
      "grad_norm": 0.9569493190879544,
      "kl": 0.1943359375,
      "learning_rate": 1.1700709932510656e-07,
      "loss": -0.0106,
      "num_tokens": 32852733.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 760.75,
      "completions/mean_terminated_length": 700.0,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 3.447530864197531,
      "grad_norm": 2.4750548674904094,
      "kl": 0.412353515625,
      "learning_rate": 1.1658403931627125e-07,
      "loss": -0.0038,
      "num_tokens": 32884169.0,
      "reward": 0.0,
      "reward_std": 0.2040814757347107,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 713.65625,
      "completions/mean_terminated_length": 703.6451416015625,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 3.450617283950617,
      "grad_norm": 1.4756793797810057,
      "kl": 0.2459716796875,
      "learning_rate": 1.1616151283273565e-07,
      "loss": 0.031,
      "num_tokens": 32913474.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.25264814496040344,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 817.4375,
      "completions/mean_terminated_length": 736.6087036132812,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 3.4537037037037037,
      "grad_norm": 1.2820093189019142,
      "kl": 0.240966796875,
      "learning_rate": 1.1573952156416672e-07,
      "loss": -0.0057,
      "num_tokens": 32946588.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 820.0,
      "completions/mean_length": 768.46875,
      "completions/mean_terminated_length": 683.2916870117188,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 3.45679012345679,
      "grad_norm": 1.1228896539965887,
      "kl": 0.2550048828125,
      "learning_rate": 1.1531806719809142e-07,
      "loss": 0.0296,
      "num_tokens": 32977639.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.15779843926429749,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 769.625,
      "completions/mean_terminated_length": 743.3103637695312,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 3.4598765432098766,
      "grad_norm": 2.1765833178857745,
      "kl": 0.2271728515625,
      "learning_rate": 1.1489715141988954e-07,
      "loss": -0.1252,
      "num_tokens": 33008415.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.17488928139209747,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 745.46875,
      "completions/mean_terminated_length": 681.1923217773438,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 3.462962962962963,
      "grad_norm": 1.4780432846675047,
      "kl": 0.2430419921875,
      "learning_rate": 1.1447677591278715e-07,
      "loss": -0.0929,
      "num_tokens": 33038698.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 714.125,
      "completions/mean_terminated_length": 669.857177734375,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 3.4660493827160495,
      "grad_norm": 0.9649833911831643,
      "kl": 0.26953125,
      "learning_rate": 1.1405694235784972e-07,
      "loss": -0.0136,
      "num_tokens": 33067386.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.08606424182653427,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 784.9375,
      "completions/mean_terminated_length": 676.2727661132812,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 3.4691358024691357,
      "grad_norm": 1.0868646245244866,
      "kl": 0.250244140625,
      "learning_rate": 1.1363765243397555e-07,
      "loss": -0.0037,
      "num_tokens": 33099024.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 778.78125,
      "completions/mean_terminated_length": 697.0416870117188,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 3.4722222222222223,
      "grad_norm": 1.1089002532131813,
      "kl": 0.242919921875,
      "learning_rate": 1.1321890781788884e-07,
      "loss": -0.0167,
      "num_tokens": 33130281.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20304615795612335,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 736.125,
      "completions/mean_terminated_length": 682.8148193359375,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 3.4753086419753085,
      "grad_norm": 1.5891742761815362,
      "kl": 0.2274169921875,
      "learning_rate": 1.1280071018413326e-07,
      "loss": 0.0249,
      "num_tokens": 33160353.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.17188216745853424,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 733.40625,
      "completions/mean_terminated_length": 666.34619140625,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 3.478395061728395,
      "grad_norm": 0.6745362013842914,
      "kl": 0.206787109375,
      "learning_rate": 1.1238306120506505e-07,
      "loss": -0.0042,
      "num_tokens": 33190170.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 806.8125,
      "completions/mean_terminated_length": 676.5,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 3.4814814814814814,
      "grad_norm": 1.0329730099114092,
      "kl": 0.2452392578125,
      "learning_rate": 1.1196596255084648e-07,
      "loss": 0.0906,
      "num_tokens": 33222672.0,
      "reward": 0.0,
      "reward_std": 0.1512327343225479,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 711.40625,
      "completions/mean_terminated_length": 623.8800048828125,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 3.484567901234568,
      "grad_norm": 0.5740854501659524,
      "kl": 0.2684326171875,
      "learning_rate": 1.11549415889439e-07,
      "loss": 0.0049,
      "num_tokens": 33252173.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 737.84375,
      "completions/mean_terminated_length": 625.8695678710938,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 3.4876543209876543,
      "grad_norm": 1.4283067364278836,
      "kl": 0.2694091796875,
      "learning_rate": 1.1113342288659683e-07,
      "loss": 0.0238,
      "num_tokens": 33282448.0,
      "reward": 0.0,
      "reward_std": 0.22223833203315735,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 707.53125,
      "completions/mean_terminated_length": 674.7930908203125,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 3.490740740740741,
      "grad_norm": 1.431448484274639,
      "kl": 0.2420654296875,
      "learning_rate": 1.1071798520585979e-07,
      "loss": -0.0524,
      "num_tokens": 33311209.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.23882049322128296,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 770.71875,
      "completions/mean_terminated_length": 723.8148193359375,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 3.493827160493827,
      "grad_norm": 0.6104158682632964,
      "kl": 0.2403564453125,
      "learning_rate": 1.1030310450854729e-07,
      "loss": 0.0239,
      "num_tokens": 33342608.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 753.46875,
      "completions/mean_terminated_length": 703.370361328125,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 512.0,
      "epoch": 3.496913580246914,
      "grad_norm": 1.4568945591861568,
      "kl": 0.2320556640625,
      "learning_rate": 1.0988878245375138e-07,
      "loss": -0.1062,
      "num_tokens": 33373091.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 753.25,
      "completions/mean_terminated_length": 703.1111450195312,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 3.5,
      "grad_norm": 0.012419470848946491,
      "kl": 0.2547607421875,
      "learning_rate": 1.094750206983299e-07,
      "loss": 0.0003,
      "num_tokens": 33403499.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 831.0,
      "completions/mean_length": 698.75,
      "completions/mean_terminated_length": 665.1034545898438,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 3.503086419753086,
      "grad_norm": 1.810029084612459,
      "kl": 0.2451171875,
      "learning_rate": 1.0906182089690025e-07,
      "loss": 0.0225,
      "num_tokens": 33431835.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20778873562812805,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 818.96875,
      "completions/mean_terminated_length": 771.6538696289062,
      "completions/min_length": 541.0,
      "completions/min_terminated_length": 541.0,
      "epoch": 3.506172839506173,
      "grad_norm": 0.732214077066824,
      "kl": 0.2076416015625,
      "learning_rate": 1.0864918470183258e-07,
      "loss": -0.0132,
      "num_tokens": 33464690.0,
      "reward": 0.0,
      "reward_std": 0.12621738016605377,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 732.65625,
      "completions/mean_terminated_length": 691.0357666015625,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 3.5092592592592595,
      "grad_norm": 1.0566014922370506,
      "kl": 0.2276611328125,
      "learning_rate": 1.0823711376324313e-07,
      "loss": 0.0285,
      "num_tokens": 33494475.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 691.3125,
      "completions/mean_terminated_length": 656.8965454101562,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 3.5123456790123457,
      "grad_norm": 0.011678948078310923,
      "kl": 0.2423095703125,
      "learning_rate": 1.0782560972898783e-07,
      "loss": 0.0002,
      "num_tokens": 33523077.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 813.40625,
      "completions/mean_terminated_length": 764.8077392578125,
      "completions/min_length": 591.0,
      "completions/min_terminated_length": 591.0,
      "epoch": 3.515432098765432,
      "grad_norm": 1.4078196528096074,
      "kl": 0.2655029296875,
      "learning_rate": 1.0741467424465544e-07,
      "loss": -0.0697,
      "num_tokens": 33555682.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 779.8125,
      "completions/mean_terminated_length": 744.9285888671875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 3.5185185185185186,
      "grad_norm": 1.0780273497558357,
      "kl": 0.21923828125,
      "learning_rate": 1.0700430895356119e-07,
      "loss": 0.0089,
      "num_tokens": 33587388.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15889893472194672,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 770.8125,
      "completions/mean_terminated_length": 734.6428833007812,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 3.521604938271605,
      "grad_norm": 0.9076270681440747,
      "kl": 0.2550048828125,
      "learning_rate": 1.0659451549674018e-07,
      "loss": -0.0297,
      "num_tokens": 33618354.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1141
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 835.6875,
      "completions/mean_terminated_length": 750.0909423828125,
      "completions/min_length": 563.0,
      "completions/min_terminated_length": 563.0,
      "epoch": 3.5246913580246915,
      "grad_norm": 1.2267087063974567,
      "kl": NaN,
      "learning_rate": 1.0618529551294053e-07,
      "loss": 0.0709,
      "num_tokens": 33651484.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.14866770803928375,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 757.9375,
      "completions/mean_terminated_length": 719.9285888671875,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 3.5277777777777777,
      "grad_norm": 1.2997993879329892,
      "kl": 0.2279052734375,
      "learning_rate": 1.0577665063861735e-07,
      "loss": 0.0211,
      "num_tokens": 33681902.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.17986111342906952,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 824.0625,
      "completions/mean_terminated_length": 745.8261108398438,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 3.5308641975308643,
      "grad_norm": 0.008235207322554906,
      "kl": 0.2332763671875,
      "learning_rate": 1.0536858250792582e-07,
      "loss": 0.0002,
      "num_tokens": 33715040.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 788.6875,
      "completions/mean_terminated_length": 745.1111450195312,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 3.5339506172839505,
      "grad_norm": 0.9345907340887593,
      "kl": 0.2178955078125,
      "learning_rate": 1.0496109275271456e-07,
      "loss": -0.0089,
      "num_tokens": 33746670.0,
      "reward": 0.0,
      "reward_std": 0.15422803163528442,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 799.125,
      "completions/mean_terminated_length": 711.1304321289062,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 3.537037037037037,
      "grad_norm": 0.7607013714831201,
      "kl": 0.2451171875,
      "learning_rate": 1.0455418300251953e-07,
      "loss": -0.0066,
      "num_tokens": 33779046.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 764.53125,
      "completions/mean_terminated_length": 704.6538696289062,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 3.5401234567901234,
      "grad_norm": 1.7747619389516969,
      "kl": 0.226806640625,
      "learning_rate": 1.0414785488455718e-07,
      "loss": 0.0338,
      "num_tokens": 33810175.0,
      "reward": -1.1175870895385742e-08,
      "reward_std": 0.312080442905426,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 1147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 687.46875,
      "completions/mean_terminated_length": 676.6128540039062,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 3.5432098765432096,
      "grad_norm": 0.792813190309275,
      "kl": 0.2557373046875,
      "learning_rate": 1.0374211002371808e-07,
      "loss": 0.0105,
      "num_tokens": 33838618.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 703.78125,
      "completions/mean_terminated_length": 658.0357666015625,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 3.5462962962962963,
      "grad_norm": 1.266345128114868,
      "kl": 0.2593994140625,
      "learning_rate": 1.0333695004256035e-07,
      "loss": 0.0295,
      "num_tokens": 33867667.0,
      "reward": -4.132743924856186e-09,
      "reward_std": 0.13894928991794586,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -2.9103830456733704e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.0,
      "completions/mean_length": 765.25,
      "completions/mean_terminated_length": 717.3333129882812,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 3.549382716049383,
      "grad_norm": 1.3837416037283727,
      "kl": 0.2548828125,
      "learning_rate": 1.0293237656130304e-07,
      "loss": 0.0193,
      "num_tokens": 33899055.0,
      "reward": 0.0,
      "reward_std": 0.24888131022453308,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 778.375,
      "completions/mean_terminated_length": 721.6923217773438,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 3.552469135802469,
      "grad_norm": 0.67763339123376,
      "kl": 0.2357177734375,
      "learning_rate": 1.0252839119782006e-07,
      "loss": -0.0031,
      "num_tokens": 33930415.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 780.0625,
      "completions/mean_terminated_length": 711.760009765625,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 3.5555555555555554,
      "grad_norm": 1.2368525690198535,
      "kl": 0.2451171875,
      "learning_rate": 1.0212499556763335e-07,
      "loss": -0.0025,
      "num_tokens": 33962165.0,
      "reward": 0.0,
      "reward_std": 0.16599814593791962,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 737.84375,
      "completions/mean_terminated_length": 718.7667236328125,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 3.558641975308642,
      "grad_norm": 1.1274774311721085,
      "kl": 0.258544921875,
      "learning_rate": 1.017221912839065e-07,
      "loss": -0.0458,
      "num_tokens": 33992228.0,
      "reward": 0.0,
      "reward_std": 0.16356289386749268,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 689.15625,
      "completions/mean_terminated_length": 654.5172119140625,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 3.5617283950617287,
      "grad_norm": 1.6189419036848904,
      "kl": 0.2236328125,
      "learning_rate": 1.0131997995743838e-07,
      "loss": 0.0808,
      "num_tokens": 34020237.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.21227142214775085,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 766.8125,
      "completions/mean_terminated_length": 719.1851806640625,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 3.564814814814815,
      "grad_norm": 1.5469509316154286,
      "kl": 0.2310791015625,
      "learning_rate": 1.0091836319665664e-07,
      "loss": -0.0053,
      "num_tokens": 34051039.0,
      "reward": 0.0,
      "reward_std": 0.18308551609516144,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 790.6875,
      "completions/mean_terminated_length": 725.3599853515625,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 3.567901234567901,
      "grad_norm": 1.6687135530647947,
      "kl": 0.2198486328125,
      "learning_rate": 1.0051734260761135e-07,
      "loss": 0.0836,
      "num_tokens": 34082773.0,
      "reward": 0.0,
      "reward_std": 0.21906724572181702,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 772.6875,
      "completions/mean_terminated_length": 726.1481323242188,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 3.5709876543209877,
      "grad_norm": 1.58301607500672,
      "kl": 0.2186279296875,
      "learning_rate": 1.0011691979396827e-07,
      "loss": -0.0523,
      "num_tokens": 34114351.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 795.5625,
      "completions/mean_terminated_length": 742.84619140625,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 3.574074074074074,
      "grad_norm": 0.5842581224367376,
      "kl": 0.266357421875,
      "learning_rate": 9.971709635700301e-08,
      "loss": -0.0228,
      "num_tokens": 34146501.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 718.875,
      "completions/mean_terminated_length": 675.2857666015625,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 3.5771604938271606,
      "grad_norm": 1.4512661129081734,
      "kl": 0.26611328125,
      "learning_rate": 9.931787389559393e-08,
      "loss": -0.042,
      "num_tokens": 34175725.0,
      "reward": 0.0,
      "reward_std": 0.15134452283382416,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 781.375,
      "completions/mean_terminated_length": 746.7142944335938,
      "completions/min_length": 553.0,
      "completions/min_terminated_length": 553.0,
      "epoch": 3.580246913580247,
      "grad_norm": 1.0641804795499572,
      "kl": 0.211181640625,
      "learning_rate": 9.891925400621642e-08,
      "loss": -0.0149,
      "num_tokens": 34207293.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.15662670135498047,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 777.09375,
      "completions/mean_terminated_length": 720.1154174804688,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 3.5833333333333335,
      "grad_norm": 0.7428261939462992,
      "kl": 0.2603759765625,
      "learning_rate": 9.852123828293612e-08,
      "loss": -0.0141,
      "num_tokens": 34239224.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 781.03125,
      "completions/mean_terminated_length": 724.9615478515625,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 3.5864197530864197,
      "grad_norm": 0.6274682781569767,
      "kl": 0.23388671875,
      "learning_rate": 9.812382831740259e-08,
      "loss": -0.0107,
      "num_tokens": 34271161.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 800.53125,
      "completions/mean_terminated_length": 759.1481323242188,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 3.5895061728395063,
      "grad_norm": 2.40200813673398,
      "kl": 0.225341796875,
      "learning_rate": 9.772702569884301e-08,
      "loss": -0.1205,
      "num_tokens": 34303158.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.19116097688674927,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 755.71875,
      "completions/mean_terminated_length": 650.7391357421875,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 3.5925925925925926,
      "grad_norm": 30.56014032933971,
      "kl": 5.2987060546875,
      "learning_rate": 9.733083201405578e-08,
      "loss": -0.0124,
      "num_tokens": 34334141.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 807.5625,
      "completions/mean_terminated_length": 709.1818237304688,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 3.5956790123456788,
      "grad_norm": 1.4681208819627474,
      "kl": 0.254150390625,
      "learning_rate": 9.693524884740425e-08,
      "loss": -0.0296,
      "num_tokens": 34366283.0,
      "reward": 0.0,
      "reward_std": 0.2006455361843109,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 765.3125,
      "completions/mean_terminated_length": 692.8800048828125,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 3.5987654320987654,
      "grad_norm": 1.6054973971152977,
      "kl": 0.2462158203125,
      "learning_rate": 9.654027778081042e-08,
      "loss": -0.0172,
      "num_tokens": 34397005.0,
      "reward": 0.0,
      "reward_std": 0.2863330543041229,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 702.59375,
      "completions/mean_terminated_length": 692.2257690429688,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 3.601851851851852,
      "grad_norm": 0.00985929988231579,
      "kl": 0.2557373046875,
      "learning_rate": 9.614592039374817e-08,
      "loss": 0.0003,
      "num_tokens": 34425944.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 724.9375,
      "completions/mean_terminated_length": 705.0000610351562,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 3.6049382716049383,
      "grad_norm": 0.5216442638898365,
      "kl": 0.2509765625,
      "learning_rate": 9.575217826323761e-08,
      "loss": 0.0341,
      "num_tokens": 34455470.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 743.6875,
      "completions/mean_terminated_length": 679.0,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 3.6080246913580245,
      "grad_norm": 0.7589034234898857,
      "kl": 0.2447509765625,
      "learning_rate": 9.535905296383848e-08,
      "loss": -0.0125,
      "num_tokens": 34485752.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 708.4375,
      "completions/mean_terminated_length": 687.4000244140625,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 3.611111111111111,
      "grad_norm": 1.4331379030331497,
      "kl": 0.23388671875,
      "learning_rate": 9.496654606764373e-08,
      "loss": 0.0319,
      "num_tokens": 34514542.0,
      "reward": 0.0,
      "reward_std": 0.21092897653579712,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 751.125,
      "completions/mean_terminated_length": 700.5925903320312,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 3.6141975308641974,
      "grad_norm": 0.011370511822070762,
      "kl": 0.2593994140625,
      "learning_rate": 9.457465914427326e-08,
      "loss": 0.0003,
      "num_tokens": 34545034.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 744.25,
      "completions/mean_terminated_length": 725.6000366210938,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.617283950617284,
      "grad_norm": 1.3221539497629224,
      "kl": 0.234375,
      "learning_rate": 9.418339376086785e-08,
      "loss": -0.1029,
      "num_tokens": 34575358.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 842.5625,
      "completions/mean_terminated_length": 718.4210815429688,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.6203703703703702,
      "grad_norm": 0.9350894369204988,
      "kl": 0.23699951171875,
      "learning_rate": 9.379275148208276e-08,
      "loss": -0.0411,
      "num_tokens": 34609080.0,
      "reward": 0.0,
      "reward_std": 0.1437978297472,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 730.34375,
      "completions/mean_terminated_length": 688.3928833007812,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 3.623456790123457,
      "grad_norm": 1.396211505512646,
      "kl": 0.2403564453125,
      "learning_rate": 9.340273387008152e-08,
      "loss": 0.0083,
      "num_tokens": 34639011.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15331010520458221,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 755.6875,
      "completions/mean_terminated_length": 706.0,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 3.626543209876543,
      "grad_norm": 1.2901908717336628,
      "kl": 0.2454833984375,
      "learning_rate": 9.30133424845294e-08,
      "loss": -0.0054,
      "num_tokens": 34669657.0,
      "reward": 0.0,
      "reward_std": 0.17049743235111237,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 721.78125,
      "completions/mean_terminated_length": 690.5172119140625,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 3.6296296296296298,
      "grad_norm": 1.2395269582660546,
      "kl": 0.2843017578125,
      "learning_rate": 9.26245788825877e-08,
      "loss": 0.0195,
      "num_tokens": 34698830.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 769.21875,
      "completions/mean_terminated_length": 722.0370483398438,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 3.632716049382716,
      "grad_norm": 2.0457587536858948,
      "kl": 0.2271728515625,
      "learning_rate": 9.223644461890711e-08,
      "loss": -0.0787,
      "num_tokens": 34729997.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.21996617317199707,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 643.125,
      "completions/mean_terminated_length": 630.8386840820312,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 3.6358024691358026,
      "grad_norm": 1.0578775835202625,
      "kl": 0.265625,
      "learning_rate": 9.184894124562162e-08,
      "loss": 0.0145,
      "num_tokens": 34756577.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 696.03125,
      "completions/mean_terminated_length": 635.2963256835938,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 3.638888888888889,
      "grad_norm": 0.9553170792799089,
      "kl": 0.2421875,
      "learning_rate": 9.146207031234232e-08,
      "loss": 0.0113,
      "num_tokens": 34784818.0,
      "reward": 0.0,
      "reward_std": 0.15482844412326813,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 741.96875,
      "completions/mean_terminated_length": 701.6785888671875,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 3.6419753086419755,
      "grad_norm": 1.3298136293147023,
      "kl": 0.283203125,
      "learning_rate": 9.107583336615124e-08,
      "loss": -0.0201,
      "num_tokens": 34814953.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 746.09375,
      "completions/mean_terminated_length": 681.9615478515625,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 3.6450617283950617,
      "grad_norm": 0.01177057179996018,
      "kl": 0.22705078125,
      "learning_rate": 9.069023195159505e-08,
      "loss": 0.0002,
      "num_tokens": 34845648.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 742.0,
      "completions/mean_terminated_length": 689.7777709960938,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 3.648148148148148,
      "grad_norm": 0.008294800869948358,
      "kl": 0.2191162109375,
      "learning_rate": 9.030526761067911e-08,
      "loss": 0.0002,
      "num_tokens": 34875980.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 774.0,
      "completions/mean_terminated_length": 676.1739501953125,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 3.6512345679012346,
      "grad_norm": 1.635013960586178,
      "kl": 0.2298583984375,
      "learning_rate": 8.992094188286081e-08,
      "loss": -0.0944,
      "num_tokens": 34907452.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 705.3125,
      "completions/mean_terminated_length": 672.3448486328125,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 3.6543209876543212,
      "grad_norm": 0.007816248613254148,
      "kl": 0.2305908203125,
      "learning_rate": 8.953725630504419e-08,
      "loss": 0.0002,
      "num_tokens": 34935962.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 809.03125,
      "completions/mean_terminated_length": 711.3181762695312,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 3.6574074074074074,
      "grad_norm": 1.2313276526376022,
      "kl": 0.24072265625,
      "learning_rate": 8.915421241157292e-08,
      "loss": -0.053,
      "num_tokens": 34968731.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 807.1875,
      "completions/mean_terminated_length": 757.1538696289062,
      "completions/min_length": 546.0,
      "completions/min_terminated_length": 546.0,
      "epoch": 3.6604938271604937,
      "grad_norm": 1.1385373165629489,
      "kl": 0.25830078125,
      "learning_rate": 8.877181173422487e-08,
      "loss": -0.0018,
      "num_tokens": 35001105.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 774.34375,
      "completions/mean_terminated_length": 716.7307739257812,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 3.6635802469135803,
      "grad_norm": 1.30860574782242,
      "kl": 0.2275390625,
      "learning_rate": 8.839005580220574e-08,
      "loss": -0.0199,
      "num_tokens": 35032208.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 735.78125,
      "completions/mean_terminated_length": 682.4074096679688,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 3.6666666666666665,
      "grad_norm": 1.3666539395504609,
      "kl": 0.282470703125,
      "learning_rate": 8.800894614214274e-08,
      "loss": -0.0402,
      "num_tokens": 35062193.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.14590159058570862,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.3969838619232178e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 720.40625,
      "completions/mean_terminated_length": 700.1666870117188,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 3.669753086419753,
      "grad_norm": 1.2104511750175704,
      "kl": 0.248779296875,
      "learning_rate": 8.762848427807882e-08,
      "loss": -0.0582,
      "num_tokens": 35091458.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.17130933701992035,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 803.875,
      "completions/mean_terminated_length": 742.239990234375,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 3.6728395061728394,
      "grad_norm": 0.7273489481754196,
      "kl": 0.2391357421875,
      "learning_rate": 8.724867173146633e-08,
      "loss": -0.0032,
      "num_tokens": 35123586.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 776.375,
      "completions/mean_terminated_length": 719.2307739257812,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 3.675925925925926,
      "grad_norm": 0.654432598917328,
      "kl": 0.27197265625,
      "learning_rate": 8.686951002116111e-08,
      "loss": 0.025,
      "num_tokens": 35154802.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 749.0625,
      "completions/mean_terminated_length": 709.7857666015625,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.6790123456790123,
      "grad_norm": 0.5660220881502624,
      "kl": 0.25634765625,
      "learning_rate": 8.649100066341614e-08,
      "loss": 0.0234,
      "num_tokens": 35184896.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 778.84375,
      "completions/mean_terminated_length": 722.269287109375,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 3.682098765432099,
      "grad_norm": 0.7583301748186921,
      "kl": 0.20263671875,
      "learning_rate": 8.611314517187584e-08,
      "loss": -0.0194,
      "num_tokens": 35216403.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 737.78125,
      "completions/mean_terminated_length": 671.7307739257812,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 3.685185185185185,
      "grad_norm": 1.406099040003787,
      "kl": 0.239013671875,
      "learning_rate": 8.573594505756982e-08,
      "loss": 0.0091,
      "num_tokens": 35246440.0,
      "reward": 0.0,
      "reward_std": 0.187990203499794,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 731.0625,
      "completions/mean_terminated_length": 700.7586059570312,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 3.6882716049382713,
      "grad_norm": 0.9753640422973019,
      "kl": 0.2176513671875,
      "learning_rate": 8.535940182890685e-08,
      "loss": 0.0288,
      "num_tokens": 35276530.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1585690975189209,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 842.25,
      "completions/mean_terminated_length": 733.2000122070312,
      "completions/min_length": 546.0,
      "completions/min_terminated_length": 546.0,
      "epoch": 3.691358024691358,
      "grad_norm": 0.8454736311530938,
      "kl": 0.216064453125,
      "learning_rate": 8.498351699166889e-08,
      "loss": -0.0311,
      "num_tokens": 35310050.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 790.59375,
      "completions/mean_terminated_length": 736.7307739257812,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 3.6944444444444446,
      "grad_norm": 0.692100861738555,
      "kl": 0.267333984375,
      "learning_rate": 8.460829204900483e-08,
      "loss": 0.0209,
      "num_tokens": 35341577.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 770.6875,
      "completions/mean_terminated_length": 723.7777709960938,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 3.697530864197531,
      "grad_norm": 0.8807008328975766,
      "kl": 0.2099609375,
      "learning_rate": 8.423372850142482e-08,
      "loss": -0.0115,
      "num_tokens": 35372319.0,
      "reward": 0.05624999850988388,
      "reward_std": 0.06495190411806107,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0625,
      "rewards/logprob_reward/std": 0.24593468010425568,
      "step": 1198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 786.75,
      "completions/mean_terminated_length": 707.6666870117188,
      "completions/min_length": 566.0,
      "completions/min_terminated_length": 566.0,
      "epoch": 3.700617283950617,
      "grad_norm": 0.9863560532915724,
      "kl": 0.23974609375,
      "learning_rate": 8.385982784679416e-08,
      "loss": -0.018,
      "num_tokens": 35404119.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 735.4375,
      "completions/mean_terminated_length": 668.84619140625,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 3.7037037037037037,
      "grad_norm": 1.3594854957902904,
      "kl": 0.2215576171875,
      "learning_rate": 8.348659158032723e-08,
      "loss": -0.0005,
      "num_tokens": 35433557.0,
      "reward": 0.0,
      "reward_std": 0.2354062795639038,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 825.0625,
      "completions/mean_terminated_length": 758.75,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 3.7067901234567904,
      "grad_norm": 1.0290489160469656,
      "kl": 0.235107421875,
      "learning_rate": 8.311402119458138e-08,
      "loss": -0.0175,
      "num_tokens": 35467111.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 809.625,
      "completions/mean_terminated_length": 712.1818237304688,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 3.7098765432098766,
      "grad_norm": 0.9492204499282125,
      "kl": 0.250244140625,
      "learning_rate": 8.274211817945135e-08,
      "loss": -0.0568,
      "num_tokens": 35499867.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 791.46875,
      "completions/mean_terminated_length": 700.478271484375,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 3.712962962962963,
      "grad_norm": 1.0787016926730821,
      "kl": 0.228759765625,
      "learning_rate": 8.237088402216297e-08,
      "loss": -0.0023,
      "num_tokens": 35531610.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 705.0,
      "completions/mean_terminated_length": 672.0,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 3.7160493827160495,
      "grad_norm": 1.6027175561626164,
      "kl": 0.2474365234375,
      "learning_rate": 8.20003202072674e-08,
      "loss": -0.0895,
      "num_tokens": 35560542.0,
      "reward": -3.259629011154175e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 754.53125,
      "completions/mean_terminated_length": 679.0799560546875,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 3.7191358024691357,
      "grad_norm": 0.918710188396746,
      "kl": 0.2266845703125,
      "learning_rate": 8.163042821663507e-08,
      "loss": 0.0214,
      "num_tokens": 35590707.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 805.4375,
      "completions/mean_terminated_length": 732.5833740234375,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 3.7222222222222223,
      "grad_norm": 0.770015704172395,
      "kl": 0.24853515625,
      "learning_rate": 8.126120952944987e-08,
      "loss": -0.0007,
      "num_tokens": 35623469.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 800.46875,
      "completions/mean_terminated_length": 759.0740966796875,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 3.7253086419753085,
      "grad_norm": 1.2081184592114798,
      "kl": 0.213134765625,
      "learning_rate": 8.089266562220312e-08,
      "loss": 0.0188,
      "num_tokens": 35656264.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 690.65625,
      "completions/mean_terminated_length": 643.0357666015625,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 3.728395061728395,
      "grad_norm": 0.8269706640905476,
      "kl": 0.260498046875,
      "learning_rate": 8.052479796868784e-08,
      "loss": -0.0257,
      "num_tokens": 35684765.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 743.0625,
      "completions/mean_terminated_length": 691.0370483398438,
      "completions/min_length": 537.0,
      "completions/min_terminated_length": 537.0,
      "epoch": 3.7314814814814814,
      "grad_norm": 1.2367759816629091,
      "kl": 0.25732421875,
      "learning_rate": 8.015760803999244e-08,
      "loss": -0.0473,
      "num_tokens": 35714971.0,
      "reward": 0.0,
      "reward_std": 0.15867450833320618,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 815.0,
      "completions/mean_terminated_length": 756.47998046875,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 3.734567901234568,
      "grad_norm": 0.010125124854750943,
      "kl": 0.243896484375,
      "learning_rate": 7.979109730449552e-08,
      "loss": 0.0002,
      "num_tokens": 35748047.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 752.34375,
      "completions/mean_terminated_length": 724.2413940429688,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 3.7376543209876543,
      "grad_norm": 0.8217062489450325,
      "kl": 0.240966796875,
      "learning_rate": 7.942526722785927e-08,
      "loss": -0.0307,
      "num_tokens": 35778626.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 811.125,
      "completions/mean_terminated_length": 714.3636474609375,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 3.7407407407407405,
      "grad_norm": 1.5405233136020489,
      "kl": 0.2342529296875,
      "learning_rate": 7.906011927302417e-08,
      "loss": -0.078,
      "num_tokens": 35811414.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2160208821296692,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096889972687,
      "step": 1212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 706.6875,
      "completions/mean_terminated_length": 661.357177734375,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 3.743827160493827,
      "grad_norm": 1.0796459999583565,
      "kl": 0.2510986328125,
      "learning_rate": 7.869565490020288e-08,
      "loss": -0.0336,
      "num_tokens": 35840152.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1774420291185379,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 725.5,
      "completions/mean_terminated_length": 705.6000366210938,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 3.746913580246914,
      "grad_norm": 0.8397243257628163,
      "kl": 0.238525390625,
      "learning_rate": 7.833187556687443e-08,
      "loss": -0.0187,
      "num_tokens": 35869388.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 800.875,
      "completions/mean_terminated_length": 713.5652465820312,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 3.75,
      "grad_norm": 2.3875383809072157,
      "kl": 0.247802734375,
      "learning_rate": 7.796878272777835e-08,
      "loss": -0.0655,
      "num_tokens": 35901652.0,
      "reward": 1.280568540096283e-09,
      "reward_std": 0.1393391489982605,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.5133991837501526e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 799.96875,
      "completions/mean_terminated_length": 737.239990234375,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 3.753086419753086,
      "grad_norm": 0.9268055926606276,
      "kl": 0.2421875,
      "learning_rate": 7.760637783490906e-08,
      "loss": -0.009,
      "num_tokens": 35933607.0,
      "reward": 0.0,
      "reward_std": 0.15413051843643188,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 750.625,
      "completions/mean_terminated_length": 711.5714721679688,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.756172839506173,
      "grad_norm": 0.8706513221683194,
      "kl": 0.244384765625,
      "learning_rate": 7.724466233750961e-08,
      "loss": 0.0241,
      "num_tokens": 35963827.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 790.3125,
      "completions/mean_terminated_length": 736.3846435546875,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 3.7592592592592595,
      "grad_norm": 1.1387784304241038,
      "kl": 0.2215576171875,
      "learning_rate": 7.688363768206651e-08,
      "loss": 0.0682,
      "num_tokens": 35996437.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 743.75,
      "completions/mean_terminated_length": 714.7586059570312,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 3.7623456790123457,
      "grad_norm": 2.5365342314904638,
      "kl": 0.2642822265625,
      "learning_rate": 7.652330531230344e-08,
      "loss": -0.1692,
      "num_tokens": 36026337.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1642131358385086,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 768.03125,
      "completions/mean_terminated_length": 731.4642944335938,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 3.765432098765432,
      "grad_norm": 1.5158845625867385,
      "kl": 0.232666015625,
      "learning_rate": 7.616366666917571e-08,
      "loss": 0.0122,
      "num_tokens": 36057790.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.1783732920885086,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 750.75,
      "completions/mean_terminated_length": 687.6923217773438,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 3.7685185185185186,
      "grad_norm": 1.118494479169383,
      "kl": 0.2314453125,
      "learning_rate": 7.580472319086442e-08,
      "loss": 0.0502,
      "num_tokens": 36088270.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1539483666419983,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 732.25,
      "completions/mean_terminated_length": 722.8386840820312,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 3.771604938271605,
      "grad_norm": 0.8305719376898156,
      "kl": 0.229736328125,
      "learning_rate": 7.544647631277085e-08,
      "loss": -0.0166,
      "num_tokens": 36117882.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 738.4375,
      "completions/mean_terminated_length": 685.5555419921875,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 3.7746913580246915,
      "grad_norm": 1.0240970318607905,
      "kl": 0.2247314453125,
      "learning_rate": 7.508892746751034e-08,
      "loss": -0.0219,
      "num_tokens": 36148088.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 755.34375,
      "completions/mean_terminated_length": 680.1199951171875,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 3.7777777777777777,
      "grad_norm": 0.9011193535676861,
      "kl": 0.2557373046875,
      "learning_rate": 7.473207808490701e-08,
      "loss": -0.0197,
      "num_tokens": 36178907.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 778.0,
      "completions/mean_terminated_length": 721.2307739257812,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 3.7808641975308643,
      "grad_norm": 1.0150857603607386,
      "kl": 0.2393798828125,
      "learning_rate": 7.437592959198796e-08,
      "loss": 0.0188,
      "num_tokens": 36210039.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 739.9375,
      "completions/mean_terminated_length": 687.3333129882812,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 3.7839506172839505,
      "grad_norm": 0.88761027735542,
      "kl": 0.236572265625,
      "learning_rate": 7.402048341297718e-08,
      "loss": 0.0144,
      "num_tokens": 36239681.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 762.3125,
      "completions/mean_terminated_length": 713.8518676757812,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 3.787037037037037,
      "grad_norm": 2.6095830013259484,
      "kl": 0.263427734375,
      "learning_rate": 7.36657409692903e-08,
      "loss": -0.0196,
      "num_tokens": 36270887.0,
      "reward": 0.0,
      "reward_std": 0.21855804324150085,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 751.03125,
      "completions/mean_terminated_length": 722.7930908203125,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 3.7901234567901234,
      "grad_norm": 0.7237038686542401,
      "kl": 0.2569580078125,
      "learning_rate": 7.331170367952874e-08,
      "loss": 0.0165,
      "num_tokens": 36301424.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 766.84375,
      "completions/mean_terminated_length": 694.8399658203125,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.7932098765432096,
      "grad_norm": 0.951997830767936,
      "kl": 0.2742919921875,
      "learning_rate": 7.295837295947404e-08,
      "loss": 0.0099,
      "num_tokens": 36332239.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1255132108926773,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 777.46875,
      "completions/mean_terminated_length": 731.8148193359375,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 3.7962962962962963,
      "grad_norm": 1.3570407985352184,
      "kl": 0.2540283203125,
      "learning_rate": 7.260575022208218e-08,
      "loss": -0.0134,
      "num_tokens": 36363738.0,
      "reward": 0.0,
      "reward_std": 0.15896356105804443,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 773.40625,
      "completions/mean_terminated_length": 727.0,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 3.799382716049383,
      "grad_norm": 1.1077044651823722,
      "kl": 0.216552734375,
      "learning_rate": 7.225383687747789e-08,
      "loss": -0.0375,
      "num_tokens": 36395031.0,
      "reward": 0.0,
      "reward_std": 0.15849053859710693,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 777.96875,
      "completions/mean_terminated_length": 709.0799560546875,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 3.802469135802469,
      "grad_norm": 1.939508985822048,
      "kl": 0.201904296875,
      "learning_rate": 7.190263433294913e-08,
      "loss": 0.0532,
      "num_tokens": 36426290.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.2718256115913391,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 679.9375,
      "completions/mean_terminated_length": 668.8386840820312,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 3.8055555555555554,
      "grad_norm": 1.4813568857659922,
      "kl": 0.2130126953125,
      "learning_rate": 7.155214399294146e-08,
      "loss": 0.0022,
      "num_tokens": 36454252.0,
      "reward": 0.0,
      "reward_std": 0.18591031432151794,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 778.6875,
      "completions/mean_terminated_length": 722.0769653320312,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 3.808641975308642,
      "grad_norm": 1.4791837298848438,
      "kl": 0.19793701171875,
      "learning_rate": 7.120236725905215e-08,
      "loss": 0.0315,
      "num_tokens": 36485714.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.14229324460029602,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 734.4375,
      "completions/mean_terminated_length": 725.0967407226562,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 3.8117283950617287,
      "grad_norm": 1.1991852092814017,
      "kl": 0.2501220703125,
      "learning_rate": 7.085330553002494e-08,
      "loss": -0.0109,
      "num_tokens": 36515200.0,
      "reward": 0.0,
      "reward_std": 0.14888522028923035,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 721.125,
      "completions/mean_terminated_length": 689.7930908203125,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 3.814814814814815,
      "grad_norm": 1.6082348672108904,
      "kl": 0.2298583984375,
      "learning_rate": 7.05049602017444e-08,
      "loss": 0.0667,
      "num_tokens": 36544972.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15909549593925476,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 791.75,
      "completions/mean_terminated_length": 686.1818237304688,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 3.817901234567901,
      "grad_norm": 1.4981153782420504,
      "kl": 0.2171630859375,
      "learning_rate": 7.015733266722993e-08,
      "loss": -0.0476,
      "num_tokens": 36576864.0,
      "reward": -2.3283064365386963e-09,
      "reward_std": 0.2329539954662323,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 738.9375,
      "completions/mean_terminated_length": 686.1481323242188,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 3.8209876543209877,
      "grad_norm": 1.256041190538623,
      "kl": 0.2286376953125,
      "learning_rate": 6.981042431663075e-08,
      "loss": 0.018,
      "num_tokens": 36607170.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15654632449150085,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 746.84375,
      "completions/mean_terminated_length": 718.1724243164062,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 3.824074074074074,
      "grad_norm": 1.0884165892836302,
      "kl": 0.2041015625,
      "learning_rate": 6.946423653722006e-08,
      "loss": -0.0175,
      "num_tokens": 36637745.0,
      "reward": 0.0,
      "reward_std": 0.15909849107265472,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 733.3125,
      "completions/mean_terminated_length": 691.7857666015625,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 3.8271604938271606,
      "grad_norm": 1.0050481325166605,
      "kl": 0.243896484375,
      "learning_rate": 6.911877071338942e-08,
      "loss": -0.0467,
      "num_tokens": 36667271.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 762.28125,
      "completions/mean_terminated_length": 713.8148193359375,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 3.830246913580247,
      "grad_norm": 1.1216072656358058,
      "kl": 0.2501220703125,
      "learning_rate": 6.877402822664352e-08,
      "loss": 0.0133,
      "num_tokens": 36697640.0,
      "reward": 0.0,
      "reward_std": 0.1578986495733261,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 758.5,
      "completions/mean_terminated_length": 709.3333129882812,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 3.8333333333333335,
      "grad_norm": 0.9022035573659892,
      "kl": 0.2386474609375,
      "learning_rate": 6.843001045559416e-08,
      "loss": 0.0079,
      "num_tokens": 36728568.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 703.75,
      "completions/mean_terminated_length": 644.4444580078125,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 3.8364197530864197,
      "grad_norm": 1.2986014931213434,
      "kl": 0.247802734375,
      "learning_rate": 6.808671877595524e-08,
      "loss": 0.0265,
      "num_tokens": 36757320.0,
      "reward": 0.0,
      "reward_std": 0.15040497481822968,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 714.3125,
      "completions/mean_terminated_length": 682.27587890625,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 3.8395061728395063,
      "grad_norm": 13.687712391070466,
      "kl": 1.268798828125,
      "learning_rate": 6.774415456053697e-08,
      "loss": -0.0369,
      "num_tokens": 36787386.0,
      "reward": -4.889443516731262e-09,
      "reward_std": 0.1524234265089035,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.259629011154175e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 714.9375,
      "completions/mean_terminated_length": 694.3333740234375,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 3.8425925925925926,
      "grad_norm": 1.0589863280587208,
      "kl": 0.2066650390625,
      "learning_rate": 6.740231917924053e-08,
      "loss": -0.0221,
      "num_tokens": 36816516.0,
      "reward": 0.0,
      "reward_std": 0.13008607923984528,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 754.96875,
      "completions/mean_terminated_length": 716.5357666015625,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 3.8456790123456788,
      "grad_norm": 1.4588364279218964,
      "kl": 0.241455078125,
      "learning_rate": 6.706121399905245e-08,
      "loss": 0.0053,
      "num_tokens": 36846823.0,
      "reward": 0.0,
      "reward_std": 0.15324005484580994,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 694.8125,
      "completions/mean_terminated_length": 672.86669921875,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 3.8487654320987654,
      "grad_norm": 0.651811657610569,
      "kl": 0.2752685546875,
      "learning_rate": 6.672084038403927e-08,
      "loss": -0.0013,
      "num_tokens": 36874885.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 774.65625,
      "completions/mean_terminated_length": 728.4815063476562,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 3.851851851851852,
      "grad_norm": 1.3869213160714808,
      "kl": 0.2127685546875,
      "learning_rate": 6.638119969534201e-08,
      "loss": 0.0236,
      "num_tokens": 36906034.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.18139660358428955,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 730.6875,
      "completions/mean_terminated_length": 688.7857666015625,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 3.8549382716049383,
      "grad_norm": 1.2346828352439247,
      "kl": 0.230224609375,
      "learning_rate": 6.604229329117064e-08,
      "loss": -0.0026,
      "num_tokens": 36935792.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 802.125,
      "completions/mean_terminated_length": 740.0,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.8580246913580245,
      "grad_norm": 1.0676462685125008,
      "kl": 0.239990234375,
      "learning_rate": 6.570412252679894e-08,
      "loss": 0.0158,
      "num_tokens": 36968488.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 783.40625,
      "completions/mean_terminated_length": 703.2083740234375,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 3.861111111111111,
      "grad_norm": 1.3215043546684153,
      "kl": 0.2652587890625,
      "learning_rate": 6.536668875455869e-08,
      "loss": 0.0101,
      "num_tokens": 37000453.0,
      "reward": 0.0,
      "reward_std": 0.16467581689357758,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 742.5,
      "completions/mean_terminated_length": 690.370361328125,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 3.8641975308641974,
      "grad_norm": 1.528425807782001,
      "kl": 0.243896484375,
      "learning_rate": 6.502999332383465e-08,
      "loss": -0.0477,
      "num_tokens": 37030729.0,
      "reward": 0.0,
      "reward_std": 0.25724995136260986,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 1252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 773.8125,
      "completions/mean_terminated_length": 703.760009765625,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 3.867283950617284,
      "grad_norm": 1.4882893888851654,
      "kl": 0.239013671875,
      "learning_rate": 6.469403758105894e-08,
      "loss": -0.0675,
      "num_tokens": 37062319.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15071991086006165,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 726.96875,
      "completions/mean_terminated_length": 671.9629516601562,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 3.8703703703703702,
      "grad_norm": 0.9228504447247637,
      "kl": 0.2623291015625,
      "learning_rate": 6.435882286970556e-08,
      "loss": 0.0083,
      "num_tokens": 37091754.0,
      "reward": 0.0,
      "reward_std": 0.1724267452955246,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 758.4375,
      "completions/mean_terminated_length": 669.9166870117188,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 3.873456790123457,
      "grad_norm": 1.6608069921641562,
      "kl": 0.2398681640625,
      "learning_rate": 6.402435053028538e-08,
      "loss": -0.0788,
      "num_tokens": 37122412.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 762.03125,
      "completions/mean_terminated_length": 734.9310302734375,
      "completions/min_length": 532.0,
      "completions/min_terminated_length": 532.0,
      "epoch": 3.876543209876543,
      "grad_norm": 0.9179852850231466,
      "kl": 0.2252197265625,
      "learning_rate": 6.369062190034036e-08,
      "loss": -0.0393,
      "num_tokens": 37152981.0,
      "reward": 2.3283064365386963e-09,
      "reward_std": 0.1416376531124115,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 803.0,
      "completions/mean_terminated_length": 741.1199951171875,
      "completions/min_length": 595.0,
      "completions/min_terminated_length": 595.0,
      "epoch": 3.8796296296296298,
      "grad_norm": 2.185652774060378,
      "kl": 0.2210693359375,
      "learning_rate": 6.335763831443847e-08,
      "loss": -0.0881,
      "num_tokens": 37185369.0,
      "reward": 0.0,
      "reward_std": 0.1948545277118683,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 772.0,
      "completions/mean_terminated_length": 688.0,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 3.882716049382716,
      "grad_norm": 3.1329146865719437,
      "kl": 0.230712890625,
      "learning_rate": 6.302540110416837e-08,
      "loss": -0.2205,
      "num_tokens": 37216725.0,
      "reward": 0.0,
      "reward_std": 0.13944709300994873,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 834.125,
      "completions/mean_terminated_length": 734.6666870117188,
      "completions/min_length": 592.0,
      "completions/min_terminated_length": 592.0,
      "epoch": 3.8858024691358026,
      "grad_norm": 1.376970072779452,
      "kl": 0.2353515625,
      "learning_rate": 6.269391159813372e-08,
      "loss": -0.0172,
      "num_tokens": 37250437.0,
      "reward": 0.0,
      "reward_std": 0.15089517831802368,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 764.375,
      "completions/mean_terminated_length": 691.6799926757812,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 3.888888888888889,
      "grad_norm": 1.6380279375547084,
      "kl": 0.25048828125,
      "learning_rate": 6.236317112194844e-08,
      "loss": -0.0775,
      "num_tokens": 37282245.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 752.40625,
      "completions/mean_terminated_length": 702.1111450195312,
      "completions/min_length": 548.0,
      "completions/min_terminated_length": 548.0,
      "epoch": 3.8919753086419755,
      "grad_norm": 0.9837897169735943,
      "kl": 0.2252197265625,
      "learning_rate": 6.203318099823094e-08,
      "loss": -0.0268,
      "num_tokens": 37312658.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 723.78125,
      "completions/mean_terminated_length": 668.1851806640625,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 3.8950617283950617,
      "grad_norm": 1.3922457182827639,
      "kl": 0.252197265625,
      "learning_rate": 6.17039425465991e-08,
      "loss": -0.0769,
      "num_tokens": 37342367.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 766.8125,
      "completions/mean_terminated_length": 719.1851806640625,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 3.898148148148148,
      "grad_norm": 1.5101869396166963,
      "kl": 0.23046875,
      "learning_rate": 6.137545708366476e-08,
      "loss": -0.0927,
      "num_tokens": 37373441.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15856757760047913,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 744.5,
      "completions/mean_terminated_length": 704.5714721679688,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 3.9012345679012346,
      "grad_norm": 0.49481321176650966,
      "kl": 0.2091064453125,
      "learning_rate": 6.104772592302868e-08,
      "loss": -0.009,
      "num_tokens": 37403501.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 698.0,
      "completions/mean_terminated_length": 664.27587890625,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 3.9043209876543212,
      "grad_norm": 1.6360742195653148,
      "kl": 0.2374267578125,
      "learning_rate": 6.072075037527519e-08,
      "loss": 0.0097,
      "num_tokens": 37432249.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 738.84375,
      "completions/mean_terminated_length": 698.107177734375,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 3.9074074074074074,
      "grad_norm": 0.9290586479842646,
      "kl": 0.234130859375,
      "learning_rate": 6.039453174796699e-08,
      "loss": -0.0167,
      "num_tokens": 37462168.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 717.40625,
      "completions/mean_terminated_length": 696.9666748046875,
      "completions/min_length": 531.0,
      "completions/min_terminated_length": 531.0,
      "epoch": 3.9104938271604937,
      "grad_norm": 1.6258232379225002,
      "kl": 0.2481689453125,
      "learning_rate": 6.006907134563973e-08,
      "loss": 0.0463,
      "num_tokens": 37491149.0,
      "reward": 0.0,
      "reward_std": 0.17654143273830414,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 774.09375,
      "completions/mean_terminated_length": 690.7916870117188,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 3.9135802469135803,
      "grad_norm": 0.8252770862492023,
      "kl": 0.256103515625,
      "learning_rate": 5.974437046979711e-08,
      "loss": 0.0221,
      "num_tokens": 37522332.0,
      "reward": 0.0,
      "reward_std": 0.12888267636299133,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 746.28125,
      "completions/mean_terminated_length": 706.607177734375,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 3.9166666666666665,
      "grad_norm": 2.2402671095082014,
      "kl": 0.19781494140625,
      "learning_rate": 5.9420430418905435e-08,
      "loss": 0.2052,
      "num_tokens": 37552597.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 729.0625,
      "completions/mean_terminated_length": 674.4444580078125,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 3.919753086419753,
      "grad_norm": 1.0094757911550474,
      "kl": 0.2978515625,
      "learning_rate": 5.909725248838854e-08,
      "loss": -0.0091,
      "num_tokens": 37582503.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 760.15625,
      "completions/mean_terminated_length": 686.2799682617188,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 3.9228395061728394,
      "grad_norm": 1.0322754762656066,
      "kl": 0.2508544921875,
      "learning_rate": 5.877483797062255e-08,
      "loss": -0.0157,
      "num_tokens": 37613260.0,
      "reward": -4.656612873077393e-10,
      "reward_std": 0.15889404714107513,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 809.28125,
      "completions/mean_terminated_length": 696.8095092773438,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 3.925925925925926,
      "grad_norm": 1.1216442703296912,
      "kl": 0.2850341796875,
      "learning_rate": 5.845318815493069e-08,
      "loss": -0.0258,
      "num_tokens": 37645861.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.21545448899269104,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 800.875,
      "completions/mean_terminated_length": 699.45458984375,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 3.9290123456790123,
      "grad_norm": 0.5340341897893215,
      "kl": 0.2393798828125,
      "learning_rate": 5.813230432757829e-08,
      "loss": 0.0022,
      "num_tokens": 37678037.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 770.8125,
      "completions/mean_terminated_length": 734.6428833007812,
      "completions/min_length": 534.0,
      "completions/min_terminated_length": 534.0,
      "epoch": 3.932098765432099,
      "grad_norm": 1.5345192291964382,
      "kl": 0.22509765625,
      "learning_rate": 5.781218777176744e-08,
      "loss": -0.0874,
      "num_tokens": 37709591.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.12057675421237946,
      "rewards/format_reward_func/mean": 7.450580596923828e-09,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 687.8125,
      "completions/mean_terminated_length": 653.0344848632812,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 3.935185185185185,
      "grad_norm": 1.0116862834248213,
      "kl": 0.26220703125,
      "learning_rate": 5.749283976763186e-08,
      "loss": 0.001,
      "num_tokens": 37737737.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 763.84375,
      "completions/mean_terminated_length": 662.0435180664062,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 3.9382716049382713,
      "grad_norm": 1.5332987303545238,
      "kl": 0.282470703125,
      "learning_rate": 5.717426159223204e-08,
      "loss": -0.0185,
      "num_tokens": 37768828.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.19998514652252197,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 828.40625,
      "completions/mean_terminated_length": 751.8695678710938,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 3.941358024691358,
      "grad_norm": 0.7900048691429388,
      "kl": 0.2333984375,
      "learning_rate": 5.685645451954976e-08,
      "loss": -0.0043,
      "num_tokens": 37802145.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 767.9375,
      "completions/mean_terminated_length": 696.239990234375,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 3.9444444444444446,
      "grad_norm": 0.8397533828484572,
      "kl": 0.259033203125,
      "learning_rate": 5.653941982048333e-08,
      "loss": 0.017,
      "num_tokens": 37833763.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 827.21875,
      "completions/mean_terminated_length": 674.1666870117188,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 3.947530864197531,
      "grad_norm": 0.6516620994614122,
      "kl": 0.3026123046875,
      "learning_rate": 5.6223158762842336e-08,
      "loss": 0.0037,
      "num_tokens": 37867098.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 717.375,
      "completions/mean_terminated_length": 673.5714721679688,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 3.950617283950617,
      "grad_norm": 1.5999414612910103,
      "kl": 0.2640380859375,
      "learning_rate": 5.59076726113426e-08,
      "loss": -0.1217,
      "num_tokens": 37896302.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 760.8125,
      "completions/mean_terminated_length": 700.0769653320312,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 3.9537037037037037,
      "grad_norm": 0.7509265961988767,
      "kl": 0.2271728515625,
      "learning_rate": 5.55929626276011e-08,
      "loss": -0.0069,
      "num_tokens": 37926740.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 747.0,
      "completions/mean_terminated_length": 728.5333862304688,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 3.9567901234567904,
      "grad_norm": 1.3573837093169847,
      "kl": 0.25048828125,
      "learning_rate": 5.527903007013099e-08,
      "loss": -0.115,
      "num_tokens": 37957236.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.17631277441978455,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 735.96875,
      "completions/mean_terminated_length": 694.8214721679688,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 3.9598765432098766,
      "grad_norm": 1.833986422518576,
      "kl": 0.2196044921875,
      "learning_rate": 5.4965876194336567e-08,
      "loss": 0.0678,
      "num_tokens": 37987575.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.19079440832138062,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1283
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 759.21875,
      "completions/mean_terminated_length": 698.1154174804688,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 3.962962962962963,
      "grad_norm": 3.1254104994235856,
      "kl": NaN,
      "learning_rate": 5.465350225250801e-08,
      "loss": -0.2184,
      "num_tokens": 38018746.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 760.5,
      "completions/mean_terminated_length": 711.7037353515625,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 3.9660493827160495,
      "grad_norm": 1.2529092562723927,
      "kl": 0.2352294921875,
      "learning_rate": 5.4341909493816786e-08,
      "loss": -0.0364,
      "num_tokens": 38049506.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 734.15625,
      "completions/mean_terminated_length": 692.7500610351562,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 3.9691358024691357,
      "grad_norm": 1.2701229684839075,
      "kl": 0.2274169921875,
      "learning_rate": 5.4031099164310314e-08,
      "loss": -0.0172,
      "num_tokens": 38079387.0,
      "reward": 0.0,
      "reward_std": 0.15204033255577087,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 815.375,
      "completions/mean_terminated_length": 776.74072265625,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 3.9722222222222223,
      "grad_norm": 1.071806597415335,
      "kl": 0.2408447265625,
      "learning_rate": 5.372107250690719e-08,
      "loss": -0.044,
      "num_tokens": 38111955.0,
      "reward": 0.0,
      "reward_std": 0.12547743320465088,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 731.15625,
      "completions/mean_terminated_length": 676.9259033203125,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 3.9753086419753085,
      "grad_norm": 1.513232095260684,
      "kl": 0.2294921875,
      "learning_rate": 5.341183076139219e-08,
      "loss": -0.0682,
      "num_tokens": 38141760.0,
      "reward": 4.656612873077393e-10,
      "reward_std": 0.21574173867702484,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.259629011154175e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 764.40625,
      "completions/mean_terminated_length": 704.5,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 3.978395061728395,
      "grad_norm": 1.1832248599741026,
      "kl": 0.2423095703125,
      "learning_rate": 5.310337516441102e-08,
      "loss": -0.0264,
      "num_tokens": 38172557.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1510920524597168,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 820.125,
      "completions/mean_terminated_length": 740.3478393554688,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 3.9814814814814814,
      "grad_norm": 2.5120349336451113,
      "kl": 0.265869140625,
      "learning_rate": 5.279570694946581e-08,
      "loss": 0.0129,
      "num_tokens": 38205573.0,
      "reward": 0.0,
      "reward_std": 0.23568767309188843,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4751909673213959,
      "step": 1290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 803.625,
      "completions/mean_terminated_length": 717.3912963867188,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 3.984567901234568,
      "grad_norm": 0.038282925972768285,
      "kl": 0.242431640625,
      "learning_rate": 5.2488827346910015e-08,
      "loss": 0.0002,
      "num_tokens": 38238237.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 751.625,
      "completions/mean_terminated_length": 712.7142944335938,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 3.9876543209876543,
      "grad_norm": 0.8511483491169654,
      "kl": 0.2491455078125,
      "learning_rate": 5.21827375839432e-08,
      "loss": -0.0101,
      "num_tokens": 38268473.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 775.625,
      "completions/mean_terminated_length": 706.0799560546875,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 3.9907407407407405,
      "grad_norm": 1.4681370946446408,
      "kl": 0.235107421875,
      "learning_rate": 5.187743888460669e-08,
      "loss": -0.1052,
      "num_tokens": 38299789.0,
      "reward": 0.0,
      "reward_std": 0.15859536826610565,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 804.75,
      "completions/mean_terminated_length": 731.6666870117188,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 3.993827160493827,
      "grad_norm": 1.5640264512679012,
      "kl": 0.2452392578125,
      "learning_rate": 5.15729324697782e-08,
      "loss": -0.1493,
      "num_tokens": 38332157.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.20989200472831726,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 743.96875,
      "completions/mean_terminated_length": 703.9642944335938,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 3.996913580246914,
      "grad_norm": 0.5742256876448175,
      "kl": 0.220458984375,
      "learning_rate": 5.126921955716723e-08,
      "loss": 0.0024,
      "num_tokens": 38362624.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 793.0625,
      "completions/mean_terminated_length": 654.5,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 4.0,
      "grad_norm": 1.823258207693366,
      "kl": 0.24072265625,
      "learning_rate": 5.096630136131e-08,
      "loss": -0.0203,
      "num_tokens": 38395266.0,
      "reward": 0.0,
      "reward_std": 0.15346293151378632,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 803.78125,
      "completions/mean_terminated_length": 752.9615478515625,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 4.003086419753086,
      "grad_norm": 1.3754924643870987,
      "kl": 0.2269287109375,
      "learning_rate": 5.0664179093564765e-08,
      "loss": -0.0257,
      "num_tokens": 38427627.0,
      "reward": 0.0,
      "reward_std": 0.12547743320465088,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 718.0,
      "completions/mean_terminated_length": 674.2857666015625,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 4.006172839506172,
      "grad_norm": 1.1025742548783262,
      "kl": 0.2431640625,
      "learning_rate": 5.036285396210685e-08,
      "loss": 0.014,
      "num_tokens": 38456931.0,
      "reward": 0.0,
      "reward_std": 0.12561336159706116,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 758.78125,
      "completions/mean_terminated_length": 684.5199584960938,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 4.0092592592592595,
      "grad_norm": 1.189512363267725,
      "kl": 0.2606201171875,
      "learning_rate": 5.0062327171923935e-08,
      "loss": 0.0187,
      "num_tokens": 38488392.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1584044247865677,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 726.40625,
      "completions/mean_terminated_length": 683.8928833007812,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 4.012345679012346,
      "grad_norm": 0.7728010763079243,
      "kl": 0.2274169921875,
      "learning_rate": 4.976259992481097e-08,
      "loss": -0.0322,
      "num_tokens": 38517773.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 738.46875,
      "completions/mean_terminated_length": 719.433349609375,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 4.015432098765432,
      "grad_norm": 1.2090473868678877,
      "kl": 0.2313232421875,
      "learning_rate": 4.946367341936578e-08,
      "loss": 0.0106,
      "num_tokens": 38547764.0,
      "reward": 0.0,
      "reward_std": 0.1590990126132965,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 706.625,
      "completions/mean_terminated_length": 661.2857666015625,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 4.018518518518518,
      "grad_norm": 1.1869989440924829,
      "kl": 0.23388671875,
      "learning_rate": 4.916554885098403e-08,
      "loss": 0.0575,
      "num_tokens": 38576708.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 711.9375,
      "completions/mean_terminated_length": 639.923095703125,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 4.021604938271605,
      "grad_norm": 1.4872898806560737,
      "kl": 0.2587890625,
      "learning_rate": 4.8868227411854287e-08,
      "loss": -0.0341,
      "num_tokens": 38606062.0,
      "reward": 0.0,
      "reward_std": 0.14842107892036438,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 757.125,
      "completions/mean_terminated_length": 707.7037353515625,
      "completions/min_length": 539.0,
      "completions/min_terminated_length": 539.0,
      "epoch": 4.0246913580246915,
      "grad_norm": 0.8220477138299612,
      "kl": 0.2186279296875,
      "learning_rate": 4.857171029095364e-08,
      "loss": 0.0149,
      "num_tokens": 38637174.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 764.8125,
      "completions/mean_terminated_length": 738.0,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 4.027777777777778,
      "grad_norm": 1.29623937090978,
      "kl": 0.2288818359375,
      "learning_rate": 4.827599867404261e-08,
      "loss": 0.0527,
      "num_tokens": 38668204.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 739.78125,
      "completions/mean_terminated_length": 699.1785888671875,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 4.030864197530864,
      "grad_norm": 1.4667987300215786,
      "kl": 0.2337646484375,
      "learning_rate": 4.7981093743660634e-08,
      "loss": -0.0212,
      "num_tokens": 38698089.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 774.0,
      "completions/mean_terminated_length": 704.0,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 4.033950617283951,
      "grad_norm": 1.5253578068506375,
      "kl": 0.2095947265625,
      "learning_rate": 4.768699667912118e-08,
      "loss": -0.0004,
      "num_tokens": 38729637.0,
      "reward": 0.0,
      "reward_std": 0.22146491706371307,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 1307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 736.96875,
      "completions/mean_terminated_length": 656.5999755859375,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 4.037037037037037,
      "grad_norm": 1.1419914492308705,
      "kl": 0.2244873046875,
      "learning_rate": 4.739370865650716e-08,
      "loss": 0.0326,
      "num_tokens": 38759980.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 707.6875,
      "completions/mean_terminated_length": 662.5,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 4.040123456790123,
      "grad_norm": 1.328147668615405,
      "kl": 0.2315673828125,
      "learning_rate": 4.710123084866602e-08,
      "loss": 0.0498,
      "num_tokens": 38788962.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1550375372171402,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 761.375,
      "completions/mean_terminated_length": 673.8333740234375,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 4.04320987654321,
      "grad_norm": 0.5849292838590477,
      "kl": 0.30615234375,
      "learning_rate": 4.6809564425205286e-08,
      "loss": 0.0211,
      "num_tokens": 38819658.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 691.65625,
      "completions/mean_terminated_length": 657.27587890625,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 4.046296296296297,
      "grad_norm": 0.9497969862630996,
      "kl": 0.24951171875,
      "learning_rate": 4.6518710552487796e-08,
      "loss": -0.0088,
      "num_tokens": 38847783.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 821.96875,
      "completions/mean_terminated_length": 742.9130859375,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 4.049382716049383,
      "grad_norm": 1.1436165805403045,
      "kl": 0.2147216796875,
      "learning_rate": 4.6228670393627014e-08,
      "loss": 0.0453,
      "num_tokens": 38881246.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.14513085782527924,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 768.25,
      "completions/mean_terminated_length": 731.7142944335938,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.052469135802469,
      "grad_norm": 1.2709758034932812,
      "kl": 0.220458984375,
      "learning_rate": 4.5939445108482466e-08,
      "loss": 0.0051,
      "num_tokens": 38912210.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 745.03125,
      "completions/mean_terminated_length": 666.9199829101562,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 4.055555555555555,
      "grad_norm": 1.5778343204658982,
      "kl": 0.238525390625,
      "learning_rate": 4.565103585365479e-08,
      "loss": -0.0128,
      "num_tokens": 38942531.0,
      "reward": 0.0,
      "reward_std": 0.1633128821849823,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 726.75,
      "completions/mean_terminated_length": 671.7037353515625,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 4.058641975308642,
      "grad_norm": 3.2831580240325646,
      "kl": 0.2197265625,
      "learning_rate": 4.536344378248161e-08,
      "loss": -0.0775,
      "num_tokens": 38972527.0,
      "reward": 0.0,
      "reward_std": 0.2469368875026703,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 818.4375,
      "completions/mean_terminated_length": 725.0,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 4.061728395061729,
      "grad_norm": 2.1271171477521342,
      "kl": 0.239013671875,
      "learning_rate": 4.50766700450326e-08,
      "loss": 0.0236,
      "num_tokens": 39005657.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.24546441435813904,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 736.71875,
      "completions/mean_terminated_length": 640.9583740234375,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 4.064814814814815,
      "grad_norm": 1.5065991977028075,
      "kl": 0.235595703125,
      "learning_rate": 4.479071578810481e-08,
      "loss": -0.1461,
      "num_tokens": 39035432.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 833.09375,
      "completions/mean_terminated_length": 779.6399536132812,
      "completions/min_length": 535.0,
      "completions/min_terminated_length": 535.0,
      "epoch": 4.067901234567901,
      "grad_norm": 0.5444820448218113,
      "kl": 0.2464599609375,
      "learning_rate": 4.450558215521838e-08,
      "loss": 0.0067,
      "num_tokens": 39069295.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 753.8125,
      "completions/mean_terminated_length": 703.7777709960938,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 4.070987654320987,
      "grad_norm": 0.043507775135647214,
      "kl": 0.244384765625,
      "learning_rate": 4.4221270286611765e-08,
      "loss": 0.0002,
      "num_tokens": 39099637.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 775.71875,
      "completions/mean_terminated_length": 706.2000122070312,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 4.074074074074074,
      "grad_norm": 1.903184764361162,
      "kl": 0.2518310546875,
      "learning_rate": 4.3937781319237175e-08,
      "loss": -0.113,
      "num_tokens": 39130872.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.26005661487579346,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 2.3283064365386963e-09,
      "rewards/logprob_reward/std": 0.43994131684303284,
      "step": 1320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 725.21875,
      "completions/mean_terminated_length": 669.888916015625,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 4.077160493827161,
      "grad_norm": 0.7411509330381716,
      "kl": 0.2197265625,
      "learning_rate": 4.365511638675612e-08,
      "loss": 0.0307,
      "num_tokens": 39160591.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 775.5,
      "completions/mean_terminated_length": 705.9199829101562,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 4.080246913580247,
      "grad_norm": 0.647895910812952,
      "kl": 0.2244873046875,
      "learning_rate": 4.337327661953477e-08,
      "loss": -0.0036,
      "num_tokens": 39192275.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 694.90625,
      "completions/mean_terminated_length": 633.9629516601562,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 4.083333333333333,
      "grad_norm": 1.7765976153792373,
      "kl": 0.25439453125,
      "learning_rate": 4.3092263144639565e-08,
      "loss": -0.0145,
      "num_tokens": 39220976.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 753.59375,
      "completions/mean_terminated_length": 714.9642944335938,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 4.08641975308642,
      "grad_norm": 0.5632068946424565,
      "kl": 0.2376708984375,
      "learning_rate": 4.281207708583256e-08,
      "loss": 0.0197,
      "num_tokens": 39251451.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 757.59375,
      "completions/mean_terminated_length": 696.1154174804688,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.089506172839506,
      "grad_norm": 0.0074581047025977934,
      "kl": 0.2513427734375,
      "learning_rate": 4.253271956356713e-08,
      "loss": 0.0003,
      "num_tokens": 39282394.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 791.9375,
      "completions/mean_terminated_length": 714.5833740234375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 4.092592592592593,
      "grad_norm": 0.48360248269634865,
      "kl": 0.2557373046875,
      "learning_rate": 4.2254191694983096e-08,
      "loss": -0.0138,
      "num_tokens": 39314036.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 775.5,
      "completions/mean_terminated_length": 692.6666870117188,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 4.095679012345679,
      "grad_norm": 0.8773153501536247,
      "kl": 0.2265625,
      "learning_rate": 4.197649459390287e-08,
      "loss": -0.0089,
      "num_tokens": 39345540.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 700.09375,
      "completions/mean_terminated_length": 653.8214721679688,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 4.098765432098766,
      "grad_norm": 1.3071656507023186,
      "kl": 0.2476806640625,
      "learning_rate": 4.169962937082635e-08,
      "loss": 0.0354,
      "num_tokens": 39374395.0,
      "reward": 0.0,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 755.8125,
      "completions/mean_terminated_length": 693.923095703125,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 4.101851851851852,
      "grad_norm": 0.012014299408962602,
      "kl": 0.256591796875,
      "learning_rate": 4.142359713292698e-08,
      "loss": 0.0003,
      "num_tokens": 39405145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 759.5,
      "completions/mean_terminated_length": 710.5184936523438,
      "completions/min_length": 563.0,
      "completions/min_terminated_length": 563.0,
      "epoch": 4.104938271604938,
      "grad_norm": 1.1647885790172063,
      "kl": 0.1990966796875,
      "learning_rate": 4.11483989840471e-08,
      "loss": 0.0107,
      "num_tokens": 39435969.0,
      "reward": 0.0,
      "reward_std": 0.1588343232870102,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 732.5625,
      "completions/mean_terminated_length": 690.9285888671875,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.1080246913580245,
      "grad_norm": 0.892157759396175,
      "kl": 0.2264404296875,
      "learning_rate": 4.087403602469347e-08,
      "loss": -0.0048,
      "num_tokens": 39465951.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 770.15625,
      "completions/mean_terminated_length": 753.2333984375,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 4.111111111111111,
      "grad_norm": 1.6380842948757888,
      "kl": 0.2479248046875,
      "learning_rate": 4.060050935203307e-08,
      "loss": -0.0459,
      "num_tokens": 39497596.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.14825105667114258,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 666.28125,
      "completions/mean_terminated_length": 642.433349609375,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 4.114197530864198,
      "grad_norm": 1.6888844533435508,
      "kl": 0.2288818359375,
      "learning_rate": 4.032782005988861e-08,
      "loss": -0.0474,
      "num_tokens": 39524797.0,
      "reward": 0.0,
      "reward_std": 0.15770003199577332,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 804.53125,
      "completions/mean_terminated_length": 718.6521606445312,
      "completions/min_length": 544.0,
      "completions/min_terminated_length": 544.0,
      "epoch": 4.117283950617284,
      "grad_norm": 1.7148153578814338,
      "kl": 0.233642578125,
      "learning_rate": 4.0055969238733945e-08,
      "loss": -0.0327,
      "num_tokens": 39557446.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1082625538110733,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 727.28125,
      "completions/mean_terminated_length": 644.2000122070312,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.12037037037037,
      "grad_norm": 0.568912016435439,
      "kl": 0.2469482421875,
      "learning_rate": 3.978495797569012e-08,
      "loss": 0.0021,
      "num_tokens": 39587207.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 781.0625,
      "completions/mean_terminated_length": 653.8095092773438,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.1234567901234565,
      "grad_norm": 1.2062782284133675,
      "kl": 0.240966796875,
      "learning_rate": 3.95147873545208e-08,
      "loss": -0.0935,
      "num_tokens": 39619137.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 718.96875,
      "completions/mean_terminated_length": 698.6333618164062,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 4.1265432098765435,
      "grad_norm": 0.7993156248367748,
      "kl": 0.2086181640625,
      "learning_rate": 3.924545845562791e-08,
      "loss": 0.0179,
      "num_tokens": 39648652.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 792.96875,
      "completions/mean_terminated_length": 759.9642944335938,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 4.12962962962963,
      "grad_norm": 1.1694157175137634,
      "kl": 0.244384765625,
      "learning_rate": 3.8976972356047325e-08,
      "loss": -0.0599,
      "num_tokens": 39680315.0,
      "reward": 0.0,
      "reward_std": 0.1475234180688858,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 711.75,
      "completions/mean_terminated_length": 653.9259033203125,
      "completions/min_length": 317.0,
      "completions/min_terminated_length": 317.0,
      "epoch": 4.132716049382716,
      "grad_norm": 1.1859909730996183,
      "kl": 0.268798828125,
      "learning_rate": 3.870933012944472e-08,
      "loss": -0.0761,
      "num_tokens": 39709199.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 756.0,
      "completions/mean_terminated_length": 717.7142944335938,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 4.135802469135802,
      "grad_norm": 2.6917961162589608,
      "kl": 0.22900390625,
      "learning_rate": 3.844253284611096e-08,
      "loss": 0.2168,
      "num_tokens": 39740131.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.16228695213794708,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 703.71875,
      "completions/mean_terminated_length": 670.586181640625,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 4.138888888888889,
      "grad_norm": 1.1046492645136858,
      "kl": 0.2418212890625,
      "learning_rate": 3.817658157295819e-08,
      "loss": -0.0054,
      "num_tokens": 39768538.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1644470989704132,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 742.25,
      "completions/mean_terminated_length": 702.0000610351562,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 4.1419753086419755,
      "grad_norm": 1.5064693460733816,
      "kl": 0.22265625,
      "learning_rate": 3.791147737351541e-08,
      "loss": 0.0433,
      "num_tokens": 39798398.0,
      "reward": 0.0,
      "reward_std": 0.2021070271730423,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 829.9375,
      "completions/mean_terminated_length": 697.1578979492188,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.145061728395062,
      "grad_norm": 1.1798170534047878,
      "kl": 0.2657470703125,
      "learning_rate": 3.7647221307923946e-08,
      "loss": -0.0032,
      "num_tokens": 39831736.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.10788977891206741,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1343
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 735.96875,
      "completions/mean_terminated_length": 682.629638671875,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.148148148148148,
      "grad_norm": 2.1159050272528184,
      "kl": NaN,
      "learning_rate": 3.738381443293376e-08,
      "loss": -0.1796,
      "num_tokens": 39862139.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15907026827335358,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 743.65625,
      "completions/mean_terminated_length": 691.74072265625,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 4.151234567901234,
      "grad_norm": 1.0283680016874122,
      "kl": 0.2330322265625,
      "learning_rate": 3.7121257801898814e-08,
      "loss": -0.0279,
      "num_tokens": 39891952.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1560322642326355,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 700.5,
      "completions/mean_terminated_length": 678.933349609375,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 4.154320987654321,
      "grad_norm": 1.3339008610165208,
      "kl": 0.2259521484375,
      "learning_rate": 3.685955246477296e-08,
      "loss": 0.0952,
      "num_tokens": 39921040.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.17479786276817322,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 769.4375,
      "completions/mean_terminated_length": 698.1599731445312,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.157407407407407,
      "grad_norm": 0.7137905706762014,
      "kl": 0.2357177734375,
      "learning_rate": 3.659869946810581e-08,
      "loss": 0.0029,
      "num_tokens": 39951866.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 734.0625,
      "completions/mean_terminated_length": 667.1538696289062,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 4.160493827160494,
      "grad_norm": 0.6583705055243982,
      "kl": 0.2454833984375,
      "learning_rate": 3.6338699855038486e-08,
      "loss": 0.0113,
      "num_tokens": 39981844.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 744.40625,
      "completions/mean_terminated_length": 692.629638671875,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 4.16358024691358,
      "grad_norm": 1.1763179529794587,
      "kl": 0.254638671875,
      "learning_rate": 3.6079554665299414e-08,
      "loss": 0.0419,
      "num_tokens": 40012597.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.17487852275371552,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 2.7939677238464355e-09,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 1349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 683.625,
      "completions/mean_terminated_length": 620.5925903320312,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 4.166666666666667,
      "grad_norm": 1.6269839135127173,
      "kl": 0.2401123046875,
      "learning_rate": 3.5821264935200294e-08,
      "loss": 0.0475,
      "num_tokens": 40040325.0,
      "reward": 0.0,
      "reward_std": 0.18347704410552979,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 715.5625,
      "completions/mean_terminated_length": 683.6551513671875,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 4.169753086419753,
      "grad_norm": 1.4784932851557824,
      "kl": 0.2340087890625,
      "learning_rate": 3.5563831697631776e-08,
      "loss": 0.0049,
      "num_tokens": 40069951.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.23774057626724243,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.43994131684303284,
      "step": 1351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 740.5625,
      "completions/mean_terminated_length": 688.0740966796875,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 4.172839506172839,
      "grad_norm": 1.0109079412657824,
      "kl": 0.2315673828125,
      "learning_rate": 3.53072559820595e-08,
      "loss": -0.0407,
      "num_tokens": 40099777.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 846.0,
      "completions/mean_length": 742.15625,
      "completions/mean_terminated_length": 677.1154174804688,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.175925925925926,
      "grad_norm": 1.4540323991785824,
      "kl": 0.2296142578125,
      "learning_rate": 3.505153881451997e-08,
      "loss": 0.0178,
      "num_tokens": 40130070.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1511717438697815,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 700.625,
      "completions/mean_terminated_length": 679.0667114257812,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.179012345679013,
      "grad_norm": 1.2291250718486564,
      "kl": 0.246337890625,
      "learning_rate": 3.479668121761617e-08,
      "loss": 0.0417,
      "num_tokens": 40159118.0,
      "reward": 0.0,
      "reward_std": 0.16396799683570862,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 759.1875,
      "completions/mean_terminated_length": 698.0769653320312,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 4.182098765432099,
      "grad_norm": 0.6068205422660885,
      "kl": 0.2266845703125,
      "learning_rate": 3.45426842105139e-08,
      "loss": -0.0017,
      "num_tokens": 40189600.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 705.5,
      "completions/mean_terminated_length": 646.5184936523438,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 4.185185185185185,
      "grad_norm": 0.955485888096828,
      "kl": 0.2479248046875,
      "learning_rate": 3.428954880893745e-08,
      "loss": -0.0167,
      "num_tokens": 40218696.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.13241708278656006,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 688.84375,
      "completions/mean_terminated_length": 640.9642944335938,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 4.188271604938271,
      "grad_norm": 1.1845120812115468,
      "kl": 0.22705078125,
      "learning_rate": 3.403727602516554e-08,
      "loss": 0.0325,
      "num_tokens": 40247387.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18656574189662933,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 716.90625,
      "completions/mean_terminated_length": 685.137939453125,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 4.191358024691358,
      "grad_norm": 2.31573766530586,
      "kl": 0.2205810546875,
      "learning_rate": 3.3785866868027426e-08,
      "loss": -0.2404,
      "num_tokens": 40276972.0,
      "reward": -6.51925802230835e-09,
      "reward_std": 0.1844421625137329,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 715.3125,
      "completions/mean_terminated_length": 671.2142944335938,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 4.194444444444445,
      "grad_norm": 0.7058595940206223,
      "kl": 0.2503662109375,
      "learning_rate": 3.353532234289849e-08,
      "loss": -0.0048,
      "num_tokens": 40306290.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 741.75,
      "completions/mean_terminated_length": 712.5516967773438,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 4.197530864197531,
      "grad_norm": 1.0673894750164643,
      "kl": 0.228271484375,
      "learning_rate": 3.3285643451696796e-08,
      "loss": 0.0088,
      "num_tokens": 40336226.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13493458926677704,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 722.1875,
      "completions/mean_terminated_length": 690.9655151367188,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 4.200617283950617,
      "grad_norm": 0.013044095090708883,
      "kl": 0.260498046875,
      "learning_rate": 3.303683119287859e-08,
      "loss": 0.0003,
      "num_tokens": 40365748.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 747.1875,
      "completions/mean_terminated_length": 695.9259033203125,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 4.203703703703703,
      "grad_norm": 1.3756894297308906,
      "kl": 0.235107421875,
      "learning_rate": 3.278888656143453e-08,
      "loss": -0.0675,
      "num_tokens": 40395882.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 781.15625,
      "completions/mean_terminated_length": 736.1851806640625,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 4.20679012345679,
      "grad_norm": 1.1453659014890265,
      "kl": 0.2275390625,
      "learning_rate": 3.254181054888569e-08,
      "loss": 0.0535,
      "num_tokens": 40427243.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15646272897720337,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 679.125,
      "completions/mean_terminated_length": 629.857177734375,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 4.209876543209877,
      "grad_norm": 1.6602154127880575,
      "kl": 0.287109375,
      "learning_rate": 3.2295604143279534e-08,
      "loss": 0.0005,
      "num_tokens": 40455471.0,
      "reward": 0.0,
      "reward_std": 0.19475162029266357,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 731.5625,
      "completions/mean_terminated_length": 712.0667114257812,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 4.212962962962963,
      "grad_norm": 1.1127604228909358,
      "kl": 0.2222900390625,
      "learning_rate": 3.205026832918606e-08,
      "loss": -0.0076,
      "num_tokens": 40485841.0,
      "reward": 0.0,
      "reward_std": 0.158689484000206,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 688.71875,
      "completions/mean_terminated_length": 666.36669921875,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 4.216049382716049,
      "grad_norm": 0.011130027266053834,
      "kl": 0.2569580078125,
      "learning_rate": 3.1805804087693676e-08,
      "loss": 0.0003,
      "num_tokens": 40513824.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 747.25,
      "completions/mean_terminated_length": 683.3846435546875,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 4.219135802469136,
      "grad_norm": 1.0897384378388661,
      "kl": 0.259765625,
      "learning_rate": 3.156221239640558e-08,
      "loss": -0.0705,
      "num_tokens": 40544048.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 816.875,
      "completions/mean_terminated_length": 747.8333740234375,
      "completions/min_length": 589.0,
      "completions/min_terminated_length": 589.0,
      "epoch": 4.222222222222222,
      "grad_norm": 3.4160042617833395,
      "kl": 0.2247314453125,
      "learning_rate": 3.13194942294355e-08,
      "loss": 0.0505,
      "num_tokens": 40576872.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 732.3125,
      "completions/mean_terminated_length": 665.0,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 4.2253086419753085,
      "grad_norm": 0.708153099053505,
      "kl": 0.275390625,
      "learning_rate": 3.1077650557404076e-08,
      "loss": 0.0106,
      "num_tokens": 40606486.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 645.8125,
      "completions/mean_terminated_length": 620.6000366210938,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 4.228395061728395,
      "grad_norm": 0.9270864093096222,
      "kl": 0.2149658203125,
      "learning_rate": 3.083668234743489e-08,
      "loss": -0.0417,
      "num_tokens": 40633316.0,
      "reward": 0.0,
      "reward_std": 0.12942197918891907,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 703.25,
      "completions/mean_terminated_length": 681.86669921875,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 4.231481481481482,
      "grad_norm": 0.009636437470312481,
      "kl": 0.236083984375,
      "learning_rate": 3.059659056315053e-08,
      "loss": 0.0002,
      "num_tokens": 40662216.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 684.625,
      "completions/mean_terminated_length": 662.0000610351562,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 4.234567901234568,
      "grad_norm": 0.7075246434007493,
      "kl": 0.27001953125,
      "learning_rate": 3.035737616466885e-08,
      "loss": 0.0019,
      "num_tokens": 40691088.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 777.96875,
      "completions/mean_terminated_length": 732.4074096679688,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 4.237654320987654,
      "grad_norm": 1.5995754242130091,
      "kl": 0.28271484375,
      "learning_rate": 3.0119040108598974e-08,
      "loss": -0.0659,
      "num_tokens": 40722603.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 706.5,
      "completions/mean_terminated_length": 661.1428833007812,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 4.2407407407407405,
      "grad_norm": 0.008518979815666915,
      "kl": 0.2490234375,
      "learning_rate": 2.98815833480377e-08,
      "loss": 0.0002,
      "num_tokens": 40751287.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 767.71875,
      "completions/mean_terminated_length": 731.107177734375,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 4.243827160493828,
      "grad_norm": 0.6234656514821495,
      "kl": 0.2220458984375,
      "learning_rate": 2.964500683256549e-08,
      "loss": -0.0157,
      "num_tokens": 40782634.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 733.15625,
      "completions/mean_terminated_length": 703.0689697265625,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 4.246913580246914,
      "grad_norm": 1.83563101530564,
      "kl": 0.21240234375,
      "learning_rate": 2.9409311508242663e-08,
      "loss": 0.1522,
      "num_tokens": 40812755.0,
      "reward": 0.0,
      "reward_std": 0.14941859245300293,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 693.46875,
      "completions/mean_terminated_length": 659.27587890625,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 4.25,
      "grad_norm": 0.6904988249895223,
      "kl": 0.2445068359375,
      "learning_rate": 2.9174498317605794e-08,
      "loss": 0.0234,
      "num_tokens": 40841254.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 759.1875,
      "completions/mean_terminated_length": 685.0399780273438,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.253086419753086,
      "grad_norm": 0.7484817161709968,
      "kl": 0.239013671875,
      "learning_rate": 2.894056819966384e-08,
      "loss": -0.0261,
      "num_tokens": 40872180.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 734.4375,
      "completions/mean_terminated_length": 680.8148193359375,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 4.256172839506172,
      "grad_norm": 2.3139778781618725,
      "kl": 0.235107421875,
      "learning_rate": 2.8707522089894354e-08,
      "loss": -0.0169,
      "num_tokens": 40902214.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.24788418412208557,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 729.90625,
      "completions/mean_terminated_length": 710.300048828125,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 4.2592592592592595,
      "grad_norm": 1.6495025569314476,
      "kl": 0.2481689453125,
      "learning_rate": 2.8475360920239723e-08,
      "loss": -0.0115,
      "num_tokens": 40932035.0,
      "reward": 0.0,
      "reward_std": 0.1961304098367691,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 768.875,
      "completions/mean_terminated_length": 710.0,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.262345679012346,
      "grad_norm": 0.45062565058710474,
      "kl": 0.3572998046875,
      "learning_rate": 2.8244085619103546e-08,
      "loss": 0.0004,
      "num_tokens": 40963531.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 750.40625,
      "completions/mean_terminated_length": 673.7999877929688,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.265432098765432,
      "grad_norm": 1.5625064773952522,
      "kl": 0.24951171875,
      "learning_rate": 2.8013697111346906e-08,
      "loss": -0.0752,
      "num_tokens": 40994400.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2023996114730835,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 754.53125,
      "completions/mean_terminated_length": 726.6551513671875,
      "completions/min_length": 509.0,
      "completions/min_terminated_length": 509.0,
      "epoch": 4.268518518518518,
      "grad_norm": 1.4024754395459498,
      "kl": 0.2464599609375,
      "learning_rate": 2.778419631828463e-08,
      "loss": -0.0026,
      "num_tokens": 41025157.0,
      "reward": 0.0,
      "reward_std": 0.2116771638393402,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 704.78125,
      "completions/mean_terminated_length": 659.1785888671875,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 4.271604938271605,
      "grad_norm": 0.7706216229059599,
      "kl": 0.2330322265625,
      "learning_rate": 2.755558415768147e-08,
      "loss": 0.0102,
      "num_tokens": 41053978.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 738.46875,
      "completions/mean_terminated_length": 685.5925903320312,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 4.2746913580246915,
      "grad_norm": 1.3983392944478534,
      "kl": 0.24609375,
      "learning_rate": 2.732786154374869e-08,
      "loss": 0.0219,
      "num_tokens": 41083481.0,
      "reward": 0.0,
      "reward_std": 0.17821970582008362,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 774.46875,
      "completions/mean_terminated_length": 691.2916870117188,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 4.277777777777778,
      "grad_norm": 0.8699763418020097,
      "kl": 0.4173583984375,
      "learning_rate": 2.7101029387140318e-08,
      "loss": 0.0004,
      "num_tokens": 41114828.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 700.0,
      "completions/mean_terminated_length": 678.4000244140625,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 4.280864197530864,
      "grad_norm": 1.2608463260542855,
      "kl": 0.248291015625,
      "learning_rate": 2.6875088594949387e-08,
      "loss": 0.034,
      "num_tokens": 41143404.0,
      "reward": 5.587935447692871e-09,
      "reward_std": 0.1777314394712448,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 770.34375,
      "completions/mean_terminated_length": 685.7916870117188,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 4.283950617283951,
      "grad_norm": 1.6406900239739175,
      "kl": 0.227783203125,
      "learning_rate": 2.6650040070704484e-08,
      "loss": -0.0029,
      "num_tokens": 41174483.0,
      "reward": 0.0,
      "reward_std": 0.18626266717910767,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 739.5,
      "completions/mean_terminated_length": 698.857177734375,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 4.287037037037037,
      "grad_norm": 0.7099944730023894,
      "kl": 0.2410888671875,
      "learning_rate": 2.6425884714365966e-08,
      "loss": -0.0283,
      "num_tokens": 41204375.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 763.125,
      "completions/mean_terminated_length": 690.0799560546875,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.290123456790123,
      "grad_norm": 0.9575861709298303,
      "kl": 0.2384033203125,
      "learning_rate": 2.6202623422322546e-08,
      "loss": -0.0255,
      "num_tokens": 41235415.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 752.6875,
      "completions/mean_terminated_length": 702.4444580078125,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 4.29320987654321,
      "grad_norm": 1.1257990878802664,
      "kl": 0.2274169921875,
      "learning_rate": 2.5980257087387546e-08,
      "loss": 0.0471,
      "num_tokens": 41265769.0,
      "reward": -4.6566128730773926e-09,
      "reward_std": 0.11199356615543365,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 864.0,
      "completions/mean_length": 759.15625,
      "completions/mean_terminated_length": 670.875,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.296296296296296,
      "grad_norm": 1.0121036308932967,
      "kl": 0.2032470703125,
      "learning_rate": 2.5758786598795325e-08,
      "loss": -0.0669,
      "num_tokens": 41296366.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 755.0625,
      "completions/mean_terminated_length": 705.25927734375,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 4.299382716049383,
      "grad_norm": 0.5909457358275607,
      "kl": 0.2374267578125,
      "learning_rate": 2.5538212842197926e-08,
      "loss": -0.0004,
      "num_tokens": 41326864.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 718.40625,
      "completions/mean_terminated_length": 686.7930908203125,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.302469135802469,
      "grad_norm": 1.166366330718843,
      "kl": 0.23663330078125,
      "learning_rate": 2.5318536699661246e-08,
      "loss": -0.0035,
      "num_tokens": 41356101.0,
      "reward": 0.0,
      "reward_std": 0.1670861691236496,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 732.75,
      "completions/mean_terminated_length": 665.5385131835938,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.305555555555555,
      "grad_norm": 1.8184002244796087,
      "kl": 0.230224609375,
      "learning_rate": 2.5099759049661802e-08,
      "loss": -0.0584,
      "num_tokens": 41385837.0,
      "reward": 0.0,
      "reward_std": 0.20034268498420715,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 667.875,
      "completions/mean_terminated_length": 656.3870849609375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 4.308641975308642,
      "grad_norm": 1.46973767164458,
      "kl": 0.2149658203125,
      "learning_rate": 2.4881880767083002e-08,
      "loss": -0.0031,
      "num_tokens": 41413601.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.18115665018558502,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 754.0625,
      "completions/mean_terminated_length": 664.0833740234375,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 4.311728395061729,
      "grad_norm": 1.8273127095817205,
      "kl": 0.276123046875,
      "learning_rate": 2.4664902723211674e-08,
      "loss": -0.02,
      "num_tokens": 41444367.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15263314545154572,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 708.53125,
      "completions/mean_terminated_length": 635.7307739257812,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 4.314814814814815,
      "grad_norm": 1.446883994262247,
      "kl": 0.3037109375,
      "learning_rate": 2.444882578573476e-08,
      "loss": 0.0152,
      "num_tokens": 41473768.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.25062650442123413,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 768.0625,
      "completions/mean_terminated_length": 731.5000610351562,
      "completions/min_length": 587.0,
      "completions/min_terminated_length": 587.0,
      "epoch": 4.317901234567901,
      "grad_norm": 1.0826332800705931,
      "kl": 0.23486328125,
      "learning_rate": 2.4233650818735573e-08,
      "loss": -0.0763,
      "num_tokens": 41505074.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 799.625,
      "completions/mean_terminated_length": 697.6364135742188,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 4.320987654320987,
      "grad_norm": 0.7950440121724911,
      "kl": 0.2513427734375,
      "learning_rate": 2.401937868269058e-08,
      "loss": -0.0097,
      "num_tokens": 41537470.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.055743563920259476,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 818.15625,
      "completions/mean_terminated_length": 724.5909423828125,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 4.324074074074074,
      "grad_norm": 2.0070380457936463,
      "kl": 0.2208251953125,
      "learning_rate": 2.380601023446577e-08,
      "loss": 0.1145,
      "num_tokens": 41570159.0,
      "reward": 0.0,
      "reward_std": 0.3259367048740387,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.5080004930496216,
      "step": 1401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 766.6875,
      "completions/mean_terminated_length": 707.3077392578125,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 4.327160493827161,
      "grad_norm": 1.3185250022760098,
      "kl": 0.240966796875,
      "learning_rate": 2.3593546327313364e-08,
      "loss": -0.0079,
      "num_tokens": 41601021.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.23924091458320618,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 719.9375,
      "completions/mean_terminated_length": 663.629638671875,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 4.330246913580247,
      "grad_norm": 0.0095477643266165,
      "kl": 0.238525390625,
      "learning_rate": 2.338198781086842e-08,
      "loss": 0.0002,
      "num_tokens": 41630547.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 761.78125,
      "completions/mean_terminated_length": 701.269287109375,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 4.333333333333333,
      "grad_norm": 0.7769882393509312,
      "kl": 0.25830078125,
      "learning_rate": 2.317133553114525e-08,
      "loss": -0.0128,
      "num_tokens": 41661440.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 758.75,
      "completions/mean_terminated_length": 709.629638671875,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 4.33641975308642,
      "grad_norm": 1.4821057581030834,
      "kl": 0.2342529296875,
      "learning_rate": 2.2961590330534298e-08,
      "loss": -0.0107,
      "num_tokens": 41692224.0,
      "reward": 0.0,
      "reward_std": 0.22113531827926636,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 766.1875,
      "completions/mean_terminated_length": 718.4444580078125,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 4.339506172839506,
      "grad_norm": 1.3278965900616837,
      "kl": 0.267822265625,
      "learning_rate": 2.2752753047798502e-08,
      "loss": 0.0647,
      "num_tokens": 41722846.0,
      "reward": 0.0,
      "reward_std": 0.15451930463314056,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 744.8125,
      "completions/mean_terminated_length": 704.9285888671875,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 4.342592592592593,
      "grad_norm": 0.7086368539435858,
      "kl": 0.2275390625,
      "learning_rate": 2.2544824518070104e-08,
      "loss": -0.0067,
      "num_tokens": 41752820.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 733.90625,
      "completions/mean_terminated_length": 692.4642944335938,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 4.345679012345679,
      "grad_norm": 1.1133817788663498,
      "kl": 0.2344970703125,
      "learning_rate": 2.2337805572847425e-08,
      "loss": -0.0251,
      "num_tokens": 41782657.0,
      "reward": 0.0,
      "reward_std": 0.17047426104545593,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 704.8125,
      "completions/mean_terminated_length": 683.5333862304688,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 4.348765432098766,
      "grad_norm": 0.6683540846668884,
      "kl": 0.2530517578125,
      "learning_rate": 2.2131697039991127e-08,
      "loss": -0.0125,
      "num_tokens": 41811603.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 725.28125,
      "completions/mean_terminated_length": 682.607177734375,
      "completions/min_length": 301.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 4.351851851851852,
      "grad_norm": 1.2166585199496174,
      "kl": 0.2332763671875,
      "learning_rate": 2.1926499743721405e-08,
      "loss": -0.0369,
      "num_tokens": 41841168.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 829.5,
      "completions/mean_terminated_length": 753.3912963867188,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 4.354938271604938,
      "grad_norm": 2.2801346107215195,
      "kl": 0.28369140625,
      "learning_rate": 2.1722214504614313e-08,
      "loss": 0.1175,
      "num_tokens": 41874748.0,
      "reward": 0.0,
      "reward_std": 0.1789824664592743,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 714.53125,
      "completions/mean_terminated_length": 670.3214721679688,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 4.3580246913580245,
      "grad_norm": 0.803676450488695,
      "kl": 0.2576904296875,
      "learning_rate": 2.1518842139598674e-08,
      "loss": 0.0087,
      "num_tokens": 41903517.0,
      "reward": 0.0,
      "reward_std": 0.13877099752426147,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 759.53125,
      "completions/mean_terminated_length": 685.47998046875,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 4.361111111111111,
      "grad_norm": 0.019441176387390194,
      "kl": 0.2454833984375,
      "learning_rate": 2.1316383461952804e-08,
      "loss": 0.0002,
      "num_tokens": 41934410.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 748.90625,
      "completions/mean_terminated_length": 697.9629516601562,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 4.364197530864198,
      "grad_norm": 1.1760925156406545,
      "kl": 0.2637939453125,
      "learning_rate": 2.1114839281301143e-08,
      "loss": 0.0219,
      "num_tokens": 41965215.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 708.5625,
      "completions/mean_terminated_length": 675.9310302734375,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 4.367283950617284,
      "grad_norm": 0.8958648811218094,
      "kl": 0.3037109375,
      "learning_rate": 2.0914210403611132e-08,
      "loss": 0.0069,
      "num_tokens": 41994477.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 741.4375,
      "completions/mean_terminated_length": 676.2307739257812,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 4.37037037037037,
      "grad_norm": 1.2453417538583529,
      "kl": 0.2357177734375,
      "learning_rate": 2.071449763118993e-08,
      "loss": 0.0133,
      "num_tokens": 42024863.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1714201271533966,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 758.875,
      "completions/mean_terminated_length": 638.3636474609375,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 4.3734567901234565,
      "grad_norm": 0.8876004902813407,
      "kl": 0.281982421875,
      "learning_rate": 2.0515701762681304e-08,
      "loss": -0.0327,
      "num_tokens": 42055639.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.08606424182653427,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 783.625,
      "completions/mean_terminated_length": 689.5652465820312,
      "completions/min_length": 418.0,
      "completions/min_terminated_length": 418.0,
      "epoch": 4.3765432098765435,
      "grad_norm": 0.6408308831338301,
      "kl": 0.2276611328125,
      "learning_rate": 2.0317823593062165e-08,
      "loss": 0.0097,
      "num_tokens": 42087427.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 774.84375,
      "completions/mean_terminated_length": 717.34619140625,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 4.37962962962963,
      "grad_norm": 1.798583489884588,
      "kl": 0.254150390625,
      "learning_rate": 2.0120863913639874e-08,
      "loss": -0.006,
      "num_tokens": 42118910.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 804.15625,
      "completions/mean_terminated_length": 753.423095703125,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 4.382716049382716,
      "grad_norm": 0.016142977629844284,
      "kl": 0.2379150390625,
      "learning_rate": 1.9924823512048438e-08,
      "loss": 0.0002,
      "num_tokens": 42151619.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 780.46875,
      "completions/mean_terminated_length": 724.269287109375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 4.385802469135802,
      "grad_norm": 1.1036477575337424,
      "kl": 0.2420654296875,
      "learning_rate": 1.972970317224601e-08,
      "loss": 0.0052,
      "num_tokens": 42183622.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 769.5625,
      "completions/mean_terminated_length": 710.84619140625,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 4.388888888888889,
      "grad_norm": 1.716196947447889,
      "kl": 0.2664794921875,
      "learning_rate": 1.9535503674511263e-08,
      "loss": -0.0756,
      "num_tokens": 42214680.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.23742079734802246,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096293926239,
      "step": 1422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 696.9375,
      "completions/mean_terminated_length": 675.1333618164062,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 4.3919753086419755,
      "grad_norm": 0.8053514278849082,
      "kl": 0.2352294921875,
      "learning_rate": 1.934222579544059e-08,
      "loss": 0.0094,
      "num_tokens": 42243374.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 873.0,
      "completions/mean_length": 703.3125,
      "completions/mean_terminated_length": 681.933349609375,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.395061728395062,
      "grad_norm": 1.5432287425828701,
      "kl": 0.25634765625,
      "learning_rate": 1.9149870307944765e-08,
      "loss": 0.0119,
      "num_tokens": 42272100.0,
      "reward": 0.0,
      "reward_std": 0.1917421817779541,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 757.40625,
      "completions/mean_terminated_length": 719.3214721679688,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 4.398148148148148,
      "grad_norm": 0.007119455157275028,
      "kl": 0.214599609375,
      "learning_rate": 1.895843798124605e-08,
      "loss": 0.0002,
      "num_tokens": 42303153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 740.53125,
      "completions/mean_terminated_length": 700.0357666015625,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 4.401234567901234,
      "grad_norm": 1.3647928972555223,
      "kl": 0.2242431640625,
      "learning_rate": 1.8767929580874863e-08,
      "loss": -0.01,
      "num_tokens": 42333558.0,
      "reward": 0.0,
      "reward_std": 0.21369892358779907,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 749.40625,
      "completions/mean_terminated_length": 686.0385131835938,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 4.404320987654321,
      "grad_norm": 1.0090806499525684,
      "kl": 0.2540283203125,
      "learning_rate": 1.8578345868666996e-08,
      "loss": 0.0519,
      "num_tokens": 42363883.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 719.53125,
      "completions/mean_terminated_length": 663.1481323242188,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 4.407407407407407,
      "grad_norm": 1.1578184340119995,
      "kl": 0.248046875,
      "learning_rate": 1.8389687602760495e-08,
      "loss": -0.0371,
      "num_tokens": 42393452.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 711.0625,
      "completions/mean_terminated_length": 653.1111450195312,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 4.410493827160494,
      "grad_norm": 1.2281974268453777,
      "kl": 0.26708984375,
      "learning_rate": 1.820195553759246e-08,
      "loss": 0.0004,
      "num_tokens": 42423002.0,
      "reward": 0.0,
      "reward_std": 0.12636974453926086,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 696.8125,
      "completions/mean_terminated_length": 662.9655151367188,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 4.41358024691358,
      "grad_norm": 1.0386678021098472,
      "kl": 0.2296142578125,
      "learning_rate": 1.8015150423896203e-08,
      "loss": 0.0068,
      "num_tokens": 42451684.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13909262418746948,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 751.9375,
      "completions/mean_terminated_length": 713.0714721679688,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 4.416666666666667,
      "grad_norm": 1.7160511739689148,
      "kl": 0.260498046875,
      "learning_rate": 1.782927300869827e-08,
      "loss": 0.0207,
      "num_tokens": 42482794.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2251279354095459,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 723.6875,
      "completions/mean_terminated_length": 692.6206665039062,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 4.419753086419753,
      "grad_norm": 0.009155227082101668,
      "kl": 0.2347412109375,
      "learning_rate": 1.7644324035315212e-08,
      "loss": 0.0002,
      "num_tokens": 42512348.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 764.96875,
      "completions/mean_terminated_length": 717.0,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.422839506172839,
      "grad_norm": 0.7984414048470345,
      "kl": 0.2386474609375,
      "learning_rate": 1.746030424335093e-08,
      "loss": 0.0002,
      "num_tokens": 42543131.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 735.5625,
      "completions/mean_terminated_length": 682.1481323242188,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 4.425925925925926,
      "grad_norm": 1.7422509028954587,
      "kl": 0.248291015625,
      "learning_rate": 1.7277214368693423e-08,
      "loss": -0.0442,
      "num_tokens": 42572933.0,
      "reward": 0.0,
      "reward_std": 0.22261665761470795,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 816.96875,
      "completions/mean_terminated_length": 735.95654296875,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 4.429012345679013,
      "grad_norm": 1.2367044502059985,
      "kl": 0.199951171875,
      "learning_rate": 1.7095055143512117e-08,
      "loss": 0.011,
      "num_tokens": 42606156.0,
      "reward": 0.0,
      "reward_std": 0.15895044803619385,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1435
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 750.15625,
      "completions/mean_terminated_length": 686.9615478515625,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 4.432098765432099,
      "grad_norm": 1.432721740278881,
      "kl": NaN,
      "learning_rate": 1.6913827296254736e-08,
      "loss": 0.0051,
      "num_tokens": 42636689.0,
      "reward": 0.0,
      "reward_std": 0.21788108348846436,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 772.28125,
      "completions/mean_terminated_length": 688.375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 4.435185185185185,
      "grad_norm": 1.2139756708776193,
      "kl": 0.28173828125,
      "learning_rate": 1.6733531551644503e-08,
      "loss": -0.0349,
      "num_tokens": 42668014.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 970.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 702.78125,
      "completions/mean_terminated_length": 702.78125,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 4.438271604938271,
      "grad_norm": 1.286333529259584,
      "kl": 0.2662353515625,
      "learning_rate": 1.655416863067713e-08,
      "loss": -0.0028,
      "num_tokens": 42696635.0,
      "reward": 0.0,
      "reward_std": 0.12554804980754852,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 824.0,
      "completions/mean_length": 724.0,
      "completions/mean_terminated_length": 654.7692260742188,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 4.441358024691358,
      "grad_norm": 1.1355877171411868,
      "kl": 0.26513671875,
      "learning_rate": 1.637573925061808e-08,
      "loss": -0.062,
      "num_tokens": 42726715.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 806.0625,
      "completions/mean_terminated_length": 733.4166870117188,
      "completions/min_length": 532.0,
      "completions/min_terminated_length": 532.0,
      "epoch": 4.444444444444445,
      "grad_norm": 0.7168739194832622,
      "kl": 0.2152099609375,
      "learning_rate": 1.6198244124999592e-08,
      "loss": 0.0137,
      "num_tokens": 42759561.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 708.875,
      "completions/mean_terminated_length": 687.86669921875,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.447530864197531,
      "grad_norm": 1.704715348627811,
      "kl": 0.251220703125,
      "learning_rate": 1.6021683963617805e-08,
      "loss": -0.1778,
      "num_tokens": 42788609.0,
      "reward": -1.3969838619232178e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 947.0,
      "completions/mean_length": 659.03125,
      "completions/mean_terminated_length": 634.7000122070312,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 4.450617283950617,
      "grad_norm": 0.9700046429366972,
      "kl": 0.256591796875,
      "learning_rate": 1.5846059472530122e-08,
      "loss": 0.0444,
      "num_tokens": 42815726.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 716.53125,
      "completions/mean_terminated_length": 672.607177734375,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 4.453703703703704,
      "grad_norm": 1.3777724970846128,
      "kl": 0.2747802734375,
      "learning_rate": 1.5671371354051997e-08,
      "loss": 0.0073,
      "num_tokens": 42845727.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.17593824863433838,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 774.3125,
      "completions/mean_terminated_length": 691.0833740234375,
      "completions/min_length": 380.0,
      "completions/min_terminated_length": 380.0,
      "epoch": 4.45679012345679,
      "grad_norm": 1.4633514790213318,
      "kl": 0.2613525390625,
      "learning_rate": 1.5497620306754582e-08,
      "loss": -0.0771,
      "num_tokens": 42876685.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1540786176919937,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 732.59375,
      "completions/mean_terminated_length": 665.34619140625,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 4.459876543209877,
      "grad_norm": 1.1255900694054242,
      "kl": 0.2542724609375,
      "learning_rate": 1.5324807025461656e-08,
      "loss": -0.0055,
      "num_tokens": 42906312.0,
      "reward": 0.0,
      "reward_std": 0.12971608340740204,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 712.5625,
      "completions/mean_terminated_length": 680.3448486328125,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 4.462962962962963,
      "grad_norm": 1.1792254002084794,
      "kl": 0.2718505859375,
      "learning_rate": 1.515293220124683e-08,
      "loss": 0.0162,
      "num_tokens": 42935194.0,
      "reward": 0.0,
      "reward_std": 0.19811780750751495,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 708.9375,
      "completions/mean_terminated_length": 650.5925903320312,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 4.466049382716049,
      "grad_norm": 1.5162448328253944,
      "kl": 0.269287109375,
      "learning_rate": 1.498199652143092e-08,
      "loss": -0.0355,
      "num_tokens": 42964260.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2714763879776001,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -5.587935447692871e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 765.5,
      "completions/mean_terminated_length": 693.1199951171875,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 4.469135802469136,
      "grad_norm": 0.5510125170453003,
      "kl": 0.2532958984375,
      "learning_rate": 1.4812000669579188e-08,
      "loss": 0.0022,
      "num_tokens": 42995064.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 717.3125,
      "completions/mean_terminated_length": 685.586181640625,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 4.472222222222222,
      "grad_norm": 0.9754741669344269,
      "kl": 0.2490234375,
      "learning_rate": 1.4642945325498507e-08,
      "loss": -0.0105,
      "num_tokens": 43024414.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 779.78125,
      "completions/mean_terminated_length": 698.375,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 4.4753086419753085,
      "grad_norm": 1.1351133490202836,
      "kl": 0.2525634765625,
      "learning_rate": 1.4474831165234707e-08,
      "loss": -0.0089,
      "num_tokens": 43056151.0,
      "reward": 0.0,
      "reward_std": 0.13225091993808746,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 708.0625,
      "completions/mean_terminated_length": 675.3793334960938,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 4.478395061728395,
      "grad_norm": 1.11370904437112,
      "kl": 0.2401123046875,
      "learning_rate": 1.4307658861069799e-08,
      "loss": 0.0537,
      "num_tokens": 43085493.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.15649469196796417,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 722.0625,
      "completions/mean_terminated_length": 690.8275756835938,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 4.481481481481482,
      "grad_norm": 1.0077098916504739,
      "kl": 0.235595703125,
      "learning_rate": 1.414142908151944e-08,
      "loss": 0.0711,
      "num_tokens": 43114755.0,
      "reward": 0.0,
      "reward_std": 0.1258188784122467,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 799.875,
      "completions/mean_terminated_length": 737.1199951171875,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 4.484567901234568,
      "grad_norm": 1.7503621536411276,
      "kl": 0.2568359375,
      "learning_rate": 1.3976142491330111e-08,
      "loss": -0.0195,
      "num_tokens": 43147339.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15790243446826935,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 761.28125,
      "completions/mean_terminated_length": 700.6538696289062,
      "completions/min_length": 524.0,
      "completions/min_terminated_length": 524.0,
      "epoch": 4.487654320987654,
      "grad_norm": 0.854007605780328,
      "kl": 0.268310546875,
      "learning_rate": 1.3811799751476588e-08,
      "loss": -0.0167,
      "num_tokens": 43178396.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 778.0,
      "completions/mean_terminated_length": 732.4444580078125,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 4.4907407407407405,
      "grad_norm": 1.2822914755192405,
      "kl": 0.242919921875,
      "learning_rate": 1.3648401519159109e-08,
      "loss": 0.0596,
      "num_tokens": 43209752.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15858054161071777,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 761.90625,
      "completions/mean_terminated_length": 724.4642944335938,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 4.493827160493828,
      "grad_norm": 3.16452500563885,
      "kl": 0.2330322265625,
      "learning_rate": 1.348594844780096e-08,
      "loss": -0.2042,
      "num_tokens": 43240645.0,
      "reward": 0.0,
      "reward_std": 0.20749691128730774,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 751.46875,
      "completions/mean_terminated_length": 675.1599731445312,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 4.496913580246914,
      "grad_norm": 0.697725287440438,
      "kl": 0.22607421875,
      "learning_rate": 1.332444118704576e-08,
      "loss": -0.0031,
      "num_tokens": 43271192.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 825.9375,
      "completions/mean_terminated_length": 735.9091186523438,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 4.5,
      "grad_norm": 1.9629974354596997,
      "kl": 0.257568359375,
      "learning_rate": 1.3163880382754761e-08,
      "loss": -0.0031,
      "num_tokens": 43305026.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.19411087036132812,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 717.71875,
      "completions/mean_terminated_length": 673.9642944335938,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 4.503086419753086,
      "grad_norm": 0.5816736542401625,
      "kl": 0.2447509765625,
      "learning_rate": 1.3004266677004522e-08,
      "loss": 0.0211,
      "num_tokens": 43334657.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 714.59375,
      "completions/mean_terminated_length": 682.586181640625,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.506172839506172,
      "grad_norm": 1.6013058739639459,
      "kl": 0.274169921875,
      "learning_rate": 1.2845600708084076e-08,
      "loss": -0.0656,
      "num_tokens": 43363748.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.08606424182653427,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 763.6875,
      "completions/mean_terminated_length": 736.7586059570312,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 4.5092592592592595,
      "grad_norm": 0.9619915024676163,
      "kl": 0.225830078125,
      "learning_rate": 1.2687883110492515e-08,
      "loss": 0.0052,
      "num_tokens": 43395030.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 846.0,
      "completions/mean_length": 660.09375,
      "completions/mean_terminated_length": 592.7037353515625,
      "completions/min_length": 380.0,
      "completions/min_terminated_length": 380.0,
      "epoch": 4.512345679012346,
      "grad_norm": 0.914771333212492,
      "kl": 0.2432861328125,
      "learning_rate": 1.2531114514936491e-08,
      "loss": -0.0298,
      "num_tokens": 43422197.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 710.15625,
      "completions/mean_terminated_length": 689.2333984375,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 4.515432098765432,
      "grad_norm": 0.008567160631542564,
      "kl": 0.2672119140625,
      "learning_rate": 1.2375295548327557e-08,
      "loss": 0.0003,
      "num_tokens": 43450938.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 773.28125,
      "completions/mean_terminated_length": 689.7083740234375,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 4.518518518518518,
      "grad_norm": 0.6652972884100276,
      "kl": 0.234375,
      "learning_rate": 1.222042683377983e-08,
      "loss": 0.0102,
      "num_tokens": 43481959.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 747.5,
      "completions/mean_terminated_length": 670.0799560546875,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.521604938271605,
      "grad_norm": 1.3273276748538272,
      "kl": 0.213134765625,
      "learning_rate": 1.2066508990607293e-08,
      "loss": -0.0103,
      "num_tokens": 43512859.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.17703334987163544,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 782.53125,
      "completions/mean_terminated_length": 726.8077392578125,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.5246913580246915,
      "grad_norm": 0.9467423673398551,
      "kl": 0.244384765625,
      "learning_rate": 1.1913542634321538e-08,
      "loss": -0.0211,
      "num_tokens": 43544524.0,
      "reward": 0.0,
      "reward_std": 0.12943127751350403,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 769.0,
      "completions/mean_terminated_length": 710.1538696289062,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 4.527777777777778,
      "grad_norm": 0.6749859640093453,
      "kl": 0.2437744140625,
      "learning_rate": 1.1761528376629137e-08,
      "loss": -0.0008,
      "num_tokens": 43575480.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 741.75,
      "completions/mean_terminated_length": 701.4285888671875,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 4.530864197530864,
      "grad_norm": 0.008226298242311067,
      "kl": 0.2564697265625,
      "learning_rate": 1.1610466825429182e-08,
      "loss": 0.0003,
      "num_tokens": 43605616.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 773.625,
      "completions/mean_terminated_length": 703.5199584960938,
      "completions/min_length": 499.0,
      "completions/min_terminated_length": 499.0,
      "epoch": 4.533950617283951,
      "grad_norm": 1.3454237770622883,
      "kl": 0.2467041015625,
      "learning_rate": 1.1460358584811091e-08,
      "loss": -0.0142,
      "num_tokens": 43637272.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 793.59375,
      "completions/mean_terminated_length": 740.423095703125,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 4.537037037037037,
      "grad_norm": 1.2266512251474808,
      "kl": 0.216796875,
      "learning_rate": 1.1311204255051942e-08,
      "loss": -0.0371,
      "num_tokens": 43668955.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15522989630699158,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 873.0,
      "completions/mean_length": 765.4375,
      "completions/mean_terminated_length": 705.769287109375,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 4.540123456790123,
      "grad_norm": 1.7762198394012427,
      "kl": 0.2493896484375,
      "learning_rate": 1.116300443261417e-08,
      "loss": -0.0782,
      "num_tokens": 43700409.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15190282464027405,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 736.6875,
      "completions/mean_terminated_length": 683.4815063476562,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 4.54320987654321,
      "grad_norm": 0.01160143827927758,
      "kl": 0.2462158203125,
      "learning_rate": 1.1015759710143124e-08,
      "loss": 0.0002,
      "num_tokens": 43730175.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 735.0625,
      "completions/mean_terminated_length": 693.7857666015625,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.546296296296296,
      "grad_norm": 1.1958380704020215,
      "kl": 0.258056640625,
      "learning_rate": 1.0869470676464848e-08,
      "loss": 0.0092,
      "num_tokens": 43759785.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 877.0,
      "completions/mean_length": 784.21875,
      "completions/mean_terminated_length": 690.3912963867188,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 4.549382716049383,
      "grad_norm": 0.008103284846342927,
      "kl": 0.241455078125,
      "learning_rate": 1.0724137916583525e-08,
      "loss": 0.0002,
      "num_tokens": 43791444.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 847.625,
      "completions/mean_terminated_length": 767.45458984375,
      "completions/min_length": 583.0,
      "completions/min_terminated_length": 583.0,
      "epoch": 4.552469135802469,
      "grad_norm": 1.0452344389501882,
      "kl": 0.244384765625,
      "learning_rate": 1.0579762011679317e-08,
      "loss": 0.0184,
      "num_tokens": 43825792.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 750.875,
      "completions/mean_terminated_length": 700.2963256835938,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 4.555555555555555,
      "grad_norm": 1.623900247495174,
      "kl": 0.250244140625,
      "learning_rate": 1.0436343539105857e-08,
      "loss": -0.0896,
      "num_tokens": 43856188.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.25143033266067505,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 820.4375,
      "completions/mean_terminated_length": 763.4400024414062,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 4.5586419753086425,
      "grad_norm": 1.1065812722542763,
      "kl": 0.262451171875,
      "learning_rate": 1.0293883072388154e-08,
      "loss": 0.0049,
      "num_tokens": 43888986.0,
      "reward": 0.0,
      "reward_std": 0.14708144962787628,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 703.96875,
      "completions/mean_terminated_length": 644.7037353515625,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 4.561728395061729,
      "grad_norm": 1.5148359069841493,
      "kl": 0.2587890625,
      "learning_rate": 1.015238118122011e-08,
      "loss": -0.0587,
      "num_tokens": 43917749.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.16143634915351868,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 826.78125,
      "completions/mean_terminated_length": 771.5599975585938,
      "completions/min_length": 596.0,
      "completions/min_terminated_length": 596.0,
      "epoch": 4.564814814814815,
      "grad_norm": 0.7943240804840882,
      "kl": 0.217529296875,
      "learning_rate": 1.0011838431462389e-08,
      "loss": -0.0164,
      "num_tokens": 43950926.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 754.25,
      "completions/mean_terminated_length": 692.0,
      "completions/min_length": 333.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 4.567901234567901,
      "grad_norm": 0.9560197383091844,
      "kl": 0.2296142578125,
      "learning_rate": 9.872255385140027e-09,
      "loss": -0.014,
      "num_tokens": 43981338.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 756.15625,
      "completions/mean_terminated_length": 666.875,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 4.570987654320987,
      "grad_norm": 0.9141392876347829,
      "kl": 0.24658203125,
      "learning_rate": 9.733632600440245e-09,
      "loss": 0.0028,
      "num_tokens": 44011979.0,
      "reward": 0.02812499739229679,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 780.21875,
      "completions/mean_terminated_length": 711.9599609375,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 4.574074074074074,
      "grad_norm": 1.618630101173056,
      "kl": 0.23486328125,
      "learning_rate": 9.595970631710248e-09,
      "loss": 0.0286,
      "num_tokens": 44043642.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.2312345653772354,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 831.0,
      "completions/mean_length": 656.90625,
      "completions/mean_terminated_length": 632.433349609375,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 4.577160493827161,
      "grad_norm": 1.6753239581135873,
      "kl": 0.28173828125,
      "learning_rate": 9.459270029454986e-09,
      "loss": -0.0866,
      "num_tokens": 44070927.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 749.75,
      "completions/mean_terminated_length": 698.9629516601562,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 4.580246913580247,
      "grad_norm": 1.3529637418083982,
      "kl": 0.2352294921875,
      "learning_rate": 9.323531340334868e-09,
      "loss": -0.0959,
      "num_tokens": 44101015.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.2225850522518158,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 763.46875,
      "completions/mean_terminated_length": 715.2222290039062,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 4.583333333333333,
      "grad_norm": 1.2894571737097351,
      "kl": 0.27978515625,
      "learning_rate": 9.188755107163743e-09,
      "loss": -0.0067,
      "num_tokens": 44131950.0,
      "reward": 0.0,
      "reward_std": 0.15898141264915466,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 769.21875,
      "completions/mean_terminated_length": 697.8800048828125,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 4.58641975308642,
      "grad_norm": 2.0465573166944764,
      "kl": 0.2640380859375,
      "learning_rate": 9.054941868906513e-09,
      "loss": 0.0193,
      "num_tokens": 44163125.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.19464506208896637,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 819.25,
      "completions/mean_terminated_length": 772.0,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 4.589506172839506,
      "grad_norm": 0.6598867747085388,
      "kl": 0.2125244140625,
      "learning_rate": 8.922092160677242e-09,
      "loss": 0.0094,
      "num_tokens": 44196009.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 732.125,
      "completions/mean_terminated_length": 678.0740966796875,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 4.592592592592593,
      "grad_norm": 1.0613875861344555,
      "kl": 0.248779296875,
      "learning_rate": 8.79020651373677e-09,
      "loss": -0.0235,
      "num_tokens": 44225837.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 811.125,
      "completions/mean_terminated_length": 740.1666870117188,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 4.595679012345679,
      "grad_norm": 1.0975005863045624,
      "kl": 0.2247314453125,
      "learning_rate": 8.659285455490745e-09,
      "loss": 0.0161,
      "num_tokens": 44258869.0,
      "reward": -2.7939677238464355e-09,
      "reward_std": 0.12777692079544067,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 836.75,
      "completions/mean_terminated_length": 784.3200073242188,
      "completions/min_length": 534.0,
      "completions/min_terminated_length": 534.0,
      "epoch": 4.598765432098766,
      "grad_norm": 1.1861886423089967,
      "kl": 0.2210693359375,
      "learning_rate": 8.529329509487455e-09,
      "loss": -0.0573,
      "num_tokens": 44292277.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 745.03125,
      "completions/mean_terminated_length": 716.1724243164062,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 4.601851851851852,
      "grad_norm": 0.7326756903585625,
      "kl": 0.25634765625,
      "learning_rate": 8.400339195415718e-09,
      "loss": 0.0043,
      "num_tokens": 44322610.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 774.5625,
      "completions/mean_terminated_length": 704.719970703125,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 4.604938271604938,
      "grad_norm": 1.3326776015722304,
      "kl": 0.2071533203125,
      "learning_rate": 8.272315029102888e-09,
      "loss": 0.0309,
      "num_tokens": 44354236.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 715.5625,
      "completions/mean_terminated_length": 683.6551513671875,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 4.6080246913580245,
      "grad_norm": 0.7337581308267972,
      "kl": 0.263427734375,
      "learning_rate": 8.145257522512606e-09,
      "loss": -0.0121,
      "num_tokens": 44383478.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 740.6875,
      "completions/mean_terminated_length": 675.3077392578125,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 4.611111111111111,
      "grad_norm": 1.4010798081409779,
      "kl": 0.2664794921875,
      "learning_rate": 8.019167183743041e-09,
      "loss": -0.0769,
      "num_tokens": 44413624.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.23660650849342346,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 684.5,
      "completions/mean_terminated_length": 649.3793334960938,
      "completions/min_length": 380.0,
      "completions/min_terminated_length": 380.0,
      "epoch": 4.614197530864198,
      "grad_norm": 1.336641774866964,
      "kl": 0.26806640625,
      "learning_rate": 7.89404451702455e-09,
      "loss": -0.0713,
      "num_tokens": 44441684.0,
      "reward": 0.028124995529651642,
      "reward_std": 0.09217105805873871,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 705.71875,
      "completions/mean_terminated_length": 672.7930908203125,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 4.617283950617284,
      "grad_norm": 1.043543092730768,
      "kl": 0.27276611328125,
      "learning_rate": 7.769890022717884e-09,
      "loss": 0.0438,
      "num_tokens": 44470839.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 763.625,
      "completions/mean_terminated_length": 715.4074096679688,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.62037037037037,
      "grad_norm": 1.4209127364921927,
      "kl": 0.21923828125,
      "learning_rate": 7.646704197312143e-09,
      "loss": 0.0321,
      "num_tokens": 44502179.0,
      "reward": 0.0,
      "reward_std": 0.14213800430297852,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 724.28125,
      "completions/mean_terminated_length": 693.27587890625,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 4.6234567901234565,
      "grad_norm": 0.5440669829807854,
      "kl": 0.2264404296875,
      "learning_rate": 7.524487533422635e-09,
      "loss": 0.0291,
      "num_tokens": 44532124.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 705.59375,
      "completions/mean_terminated_length": 672.6551513671875,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 4.6265432098765435,
      "grad_norm": 1.8505941992881312,
      "kl": 0.24951171875,
      "learning_rate": 7.403240519789161e-09,
      "loss": 0.0937,
      "num_tokens": 44561379.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.17386360466480255,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 716.40625,
      "completions/mean_terminated_length": 706.4838256835938,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 4.62962962962963,
      "grad_norm": 1.0809505779188693,
      "kl": 0.2236328125,
      "learning_rate": 7.282963641273842e-09,
      "loss": 0.0114,
      "num_tokens": 44590408.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 777.1875,
      "completions/mean_terminated_length": 694.9166870117188,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 4.632716049382716,
      "grad_norm": 1.2386119792959611,
      "kl": 0.2620849609375,
      "learning_rate": 7.163657378859267e-09,
      "loss": -0.0661,
      "num_tokens": 44622134.0,
      "reward": 0.0,
      "reward_std": 0.14349564909934998,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 762.09375,
      "completions/mean_terminated_length": 713.5925903320312,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 4.635802469135802,
      "grad_norm": 1.603004335218694,
      "kl": 0.2332763671875,
      "learning_rate": 7.045322209646654e-09,
      "loss": 0.0418,
      "num_tokens": 44652609.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15408048033714294,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 681.78125,
      "completions/mean_terminated_length": 670.741943359375,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 4.638888888888889,
      "grad_norm": 1.3373995959380525,
      "kl": 0.25177001953125,
      "learning_rate": 6.927958606853746e-09,
      "loss": -0.0089,
      "num_tokens": 44680562.0,
      "reward": 0.0,
      "reward_std": 0.159096360206604,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 687.5,
      "completions/mean_terminated_length": 676.6451416015625,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 4.6419753086419755,
      "grad_norm": 1.2645721070428075,
      "kl": 0.2462158203125,
      "learning_rate": 6.811567039813087e-09,
      "loss": 0.0361,
      "num_tokens": 44708718.0,
      "reward": 0.0,
      "reward_std": 0.12781395018100739,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 759.75,
      "completions/mean_terminated_length": 722.0000610351562,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.645061728395062,
      "grad_norm": 0.7789487740363542,
      "kl": 0.2628173828125,
      "learning_rate": 6.696147973970112e-09,
      "loss": -0.0186,
      "num_tokens": 44739126.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 757.5625,
      "completions/mean_terminated_length": 653.3043823242188,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 4.648148148148148,
      "grad_norm": 1.4287106138991188,
      "kl": 0.2666015625,
      "learning_rate": 6.581701870881196e-09,
      "loss": 0.0116,
      "num_tokens": 44769684.0,
      "reward": 0.0,
      "reward_std": 0.16816477477550507,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 697.46875,
      "completions/mean_terminated_length": 637.0,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "epoch": 4.651234567901234,
      "grad_norm": 0.01064046985645947,
      "kl": 0.22314453125,
      "learning_rate": 6.4682291882119375e-09,
      "loss": 0.0002,
      "num_tokens": 44798619.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 749.15625,
      "completions/mean_terminated_length": 685.7307739257812,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 4.654320987654321,
      "grad_norm": 1.4356134177204276,
      "kl": 0.2442626953125,
      "learning_rate": 6.355730379735219e-09,
      "loss": -0.0287,
      "num_tokens": 44828888.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1577756106853485,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 747.5,
      "completions/mean_terminated_length": 708.0000610351562,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 4.657407407407407,
      "grad_norm": 1.2655852724598151,
      "kl": 0.2147216796875,
      "learning_rate": 6.244205895329452e-09,
      "loss": -0.0887,
      "num_tokens": 44859128.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 757.15625,
      "completions/mean_terminated_length": 682.4400024414062,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.660493827160494,
      "grad_norm": 0.007769112039093948,
      "kl": 0.218505859375,
      "learning_rate": 6.133656180976776e-09,
      "loss": 0.0002,
      "num_tokens": 44889905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 753.375,
      "completions/mean_terminated_length": 663.1666870117188,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 4.66358024691358,
      "grad_norm": 1.7984021331093254,
      "kl": 0.27685546875,
      "learning_rate": 6.024081678761228e-09,
      "loss": -0.0534,
      "num_tokens": 44920517.0,
      "reward": 0.0,
      "reward_std": 0.21526148915290833,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 7.450580596923828e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 741.9375,
      "completions/mean_terminated_length": 701.6428833007812,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 4.666666666666667,
      "grad_norm": 0.6358532654376484,
      "kl": 0.2335205078125,
      "learning_rate": 5.915482826867047e-09,
      "loss": -0.0241,
      "num_tokens": 44951011.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 910.0,
      "completions/mean_length": 692.25,
      "completions/mean_terminated_length": 630.8148193359375,
      "completions/min_length": 377.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 4.669753086419753,
      "grad_norm": 0.7904306048798535,
      "kl": 0.259521484375,
      "learning_rate": 5.807860059576841e-09,
      "loss": 0.0035,
      "num_tokens": 44979279.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 734.59375,
      "completions/mean_terminated_length": 681.0,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 4.672839506172839,
      "grad_norm": 0.9593295827412627,
      "kl": 0.2333984375,
      "learning_rate": 5.701213807269956e-09,
      "loss": 0.037,
      "num_tokens": 45009030.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.15900850296020508,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 749.53125,
      "completions/mean_terminated_length": 658.0416870117188,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 4.675925925925926,
      "grad_norm": 0.8617107703273437,
      "kl": 0.249267578125,
      "learning_rate": 5.5955444964206345e-09,
      "loss": -0.0191,
      "num_tokens": 45039079.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 776.40625,
      "completions/mean_terminated_length": 693.875,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 4.679012345679013,
      "grad_norm": 0.8997391467082752,
      "kl": 0.2471923828125,
      "learning_rate": 5.490852549596387e-09,
      "loss": 0.0058,
      "num_tokens": 45070608.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 719.75,
      "completions/mean_terminated_length": 676.2857666015625,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 4.682098765432099,
      "grad_norm": 0.01289473649206381,
      "kl": 0.2464599609375,
      "learning_rate": 5.387138385456319e-09,
      "loss": 0.0002,
      "num_tokens": 45099740.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 801.46875,
      "completions/mean_terminated_length": 700.3181762695312,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 4.685185185185185,
      "grad_norm": 1.4600700216093516,
      "kl": 0.2261962890625,
      "learning_rate": 5.284402418749362e-09,
      "loss": -0.0393,
      "num_tokens": 45132163.0,
      "reward": 0.0,
      "reward_std": 0.141945943236351,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1518
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 752.28125,
      "completions/mean_terminated_length": 734.1666870117188,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 4.688271604938271,
      "grad_norm": 1.2493370767929117,
      "kl": NaN,
      "learning_rate": 5.182645060312685e-09,
      "loss": 0.0287,
      "num_tokens": 45162660.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.15481436252593994,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 687.875,
      "completions/mean_terminated_length": 639.857177734375,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 4.6913580246913575,
      "grad_norm": 1.0526833986952737,
      "kl": 0.2335205078125,
      "learning_rate": 5.081866717070088e-09,
      "loss": 0.0252,
      "num_tokens": 45191200.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 746.46875,
      "completions/mean_terminated_length": 668.760009765625,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.694444444444445,
      "grad_norm": 0.8318476170545551,
      "kl": 0.2430419921875,
      "learning_rate": 4.9820677920302534e-09,
      "loss": -0.0205,
      "num_tokens": 45221611.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 710.28125,
      "completions/mean_terminated_length": 677.8275756835938,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 4.697530864197531,
      "grad_norm": 3.9223276908849445,
      "kl": 0.2493896484375,
      "learning_rate": 4.883248684285302e-09,
      "loss": -0.2162,
      "num_tokens": 45250884.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 779.78125,
      "completions/mean_terminated_length": 744.8928833007812,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 4.700617283950617,
      "grad_norm": 0.7614772196649142,
      "kl": 0.2451171875,
      "learning_rate": 4.785409789008988e-09,
      "loss": -0.0019,
      "num_tokens": 45282753.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 679.65625,
      "completions/mean_terminated_length": 630.4642944335938,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 4.703703703703704,
      "grad_norm": 0.5811976227136649,
      "kl": 0.2579345703125,
      "learning_rate": 4.68855149745534e-09,
      "loss": -0.0108,
      "num_tokens": 45310642.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 769.4375,
      "completions/mean_terminated_length": 710.6923217773438,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 4.70679012345679,
      "grad_norm": 0.9731395835926854,
      "kl": 0.254638671875,
      "learning_rate": 4.592674196956914e-09,
      "loss": 0.0056,
      "num_tokens": 45341792.0,
      "reward": 0.0,
      "reward_std": 0.1462501883506775,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 771.25,
      "completions/mean_terminated_length": 724.4444580078125,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 4.709876543209877,
      "grad_norm": 2.1363373228569023,
      "kl": 0.2135009765625,
      "learning_rate": 4.497778270923374e-09,
      "loss": -0.1513,
      "num_tokens": 45373044.0,
      "reward": 0.0,
      "reward_std": 0.14937826991081238,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 792.53125,
      "completions/mean_terminated_length": 653.6500244140625,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.712962962962963,
      "grad_norm": 0.012083313832523389,
      "kl": 0.2393798828125,
      "learning_rate": 4.403864098839833e-09,
      "loss": 0.0002,
      "num_tokens": 45404565.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 735.84375,
      "completions/mean_terminated_length": 655.1599731445312,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 4.716049382716049,
      "grad_norm": 0.009532706765631062,
      "kl": 0.2425537109375,
      "learning_rate": 4.31093205626551e-09,
      "loss": 0.0002,
      "num_tokens": 45434516.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 784.9375,
      "completions/mean_terminated_length": 691.3912963867188,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 4.719135802469136,
      "grad_norm": 0.6964502391952809,
      "kl": 0.2406005859375,
      "learning_rate": 4.218982514832048e-09,
      "loss": -0.0033,
      "num_tokens": 45466806.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 800.21875,
      "completions/mean_terminated_length": 768.2500610351562,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 4.722222222222222,
      "grad_norm": 3.609378764365454,
      "kl": 0.2320556640625,
      "learning_rate": 4.128015842242122e-09,
      "loss": -0.1872,
      "num_tokens": 45499197.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1012.0,
      "completions/mean_length": 752.75,
      "completions/mean_terminated_length": 662.3333740234375,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 4.7253086419753085,
      "grad_norm": 2.032506028024498,
      "kl": 0.2476806640625,
      "learning_rate": 4.0380324022679935e-09,
      "loss": -0.1037,
      "num_tokens": 45529729.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.2004832774400711,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 807.53125,
      "completions/mean_terminated_length": 735.375,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 4.728395061728395,
      "grad_norm": 1.5668059653165416,
      "kl": 0.2254638671875,
      "learning_rate": 3.9490325547499316e-09,
      "loss": 0.1095,
      "num_tokens": 45562542.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.17221274971961975,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592105805873871,
      "step": 1532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 950.0,
      "completions/mean_length": 819.59375,
      "completions/mean_terminated_length": 679.7368774414062,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 4.731481481481482,
      "grad_norm": 1.5434780132034096,
      "kl": 0.275146484375,
      "learning_rate": 3.861016655594962e-09,
      "loss": -0.0888,
      "num_tokens": 45595733.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.17549976706504822,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 751.53125,
      "completions/mean_terminated_length": 701.0740966796875,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 4.734567901234568,
      "grad_norm": 0.6816593991016254,
      "kl": 0.279541015625,
      "learning_rate": 3.773985056775258e-09,
      "loss": 0.0017,
      "num_tokens": 45626302.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 786.9375,
      "completions/mean_terminated_length": 753.0714721679688,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 4.737654320987654,
      "grad_norm": 0.5020631044292254,
      "kl": 0.2392578125,
      "learning_rate": 3.68793810632681e-09,
      "loss": 0.0247,
      "num_tokens": 45657840.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1535
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 810.6875,
      "completions/mean_terminated_length": 750.9599609375,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 4.7407407407407405,
      "grad_norm": 0.6664601360080478,
      "kl": NaN,
      "learning_rate": 3.602876148348116e-09,
      "loss": 0.023,
      "num_tokens": 45690882.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 822.15625,
      "completions/mean_terminated_length": 730.4091186523438,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 4.743827160493828,
      "grad_norm": 1.017677491573792,
      "kl": 0.242431640625,
      "learning_rate": 3.518799522998661e-09,
      "loss": 0.0212,
      "num_tokens": 45723983.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.13733375072479248,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 787.28125,
      "completions/mean_terminated_length": 732.6538696289062,
      "completions/min_length": 465.0,
      "completions/min_terminated_length": 465.0,
      "epoch": 4.746913580246914,
      "grad_norm": 0.49859467161351095,
      "kl": 0.2479248046875,
      "learning_rate": 3.435708566497608e-09,
      "loss": 0.021,
      "num_tokens": 45755436.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 720.875,
      "completions/mean_terminated_length": 689.5172119140625,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 4.75,
      "grad_norm": 1.004742760572375,
      "kl": 0.2672119140625,
      "learning_rate": 3.353603611122524e-09,
      "loss": 0.0252,
      "num_tokens": 45784804.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 803.90625,
      "completions/mean_terminated_length": 717.7825927734375,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 4.753086419753086,
      "grad_norm": 0.5757932819249877,
      "kl": 0.270751953125,
      "learning_rate": 3.2724849852079628e-09,
      "loss": -0.0036,
      "num_tokens": 45817269.0,
      "reward": 0.0,
      "reward_std": 0.09185586124658585,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 742.4375,
      "completions/mean_terminated_length": 702.2142944335938,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.756172839506172,
      "grad_norm": 1.8105014997801236,
      "kl": 0.246337890625,
      "learning_rate": 3.192353013144189e-09,
      "loss": 0.0651,
      "num_tokens": 45847699.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.18843232095241547,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 754.59375,
      "completions/mean_terminated_length": 692.423095703125,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 4.7592592592592595,
      "grad_norm": 1.6201573856275062,
      "kl": 0.225341796875,
      "learning_rate": 3.113208015375901e-09,
      "loss": -0.0074,
      "num_tokens": 45878718.0,
      "reward": 0.0,
      "reward_std": 0.20674464106559753,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 746.78125,
      "completions/mean_terminated_length": 728.300048828125,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 4.762345679012346,
      "grad_norm": 1.1305309049887073,
      "kl": 0.2535400390625,
      "learning_rate": 3.0350503084008995e-09,
      "loss": 0.0225,
      "num_tokens": 45909379.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.1590670943260193,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 738.34375,
      "completions/mean_terminated_length": 719.300048828125,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 4.765432098765432,
      "grad_norm": 1.210494311317426,
      "kl": 0.20025634765625,
      "learning_rate": 2.957880204768809e-09,
      "loss": 0.0753,
      "num_tokens": 45939098.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.1588881015777588,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 716.6875,
      "completions/mean_terminated_length": 672.7857666015625,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 4.768518518518518,
      "grad_norm": 0.8778223386458659,
      "kl": 0.2440185546875,
      "learning_rate": 2.8816980130799418e-09,
      "loss": 0.0057,
      "num_tokens": 45968568.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 948.0,
      "completions/mean_length": 750.03125,
      "completions/mean_terminated_length": 686.8077392578125,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 4.771604938271605,
      "grad_norm": 1.5742560739560612,
      "kl": 0.25,
      "learning_rate": 2.806504037983992e-09,
      "loss": -0.0526,
      "num_tokens": 45999521.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1904844343662262,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 760.78125,
      "completions/mean_terminated_length": 700.0385131835938,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 4.7746913580246915,
      "grad_norm": 1.1050697029317205,
      "kl": 0.2333984375,
      "learning_rate": 2.7322985801787046e-09,
      "loss": 0.0259,
      "num_tokens": 46030090.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.18398132920265198,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 796.1875,
      "completions/mean_terminated_length": 676.857177734375,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 4.777777777777778,
      "grad_norm": 0.606037516455463,
      "kl": 0.243408203125,
      "learning_rate": 2.6590819364088746e-09,
      "loss": 0.0071,
      "num_tokens": 46062232.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 744.71875,
      "completions/mean_terminated_length": 651.625,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 4.780864197530864,
      "grad_norm": 1.117436020312828,
      "kl": 0.2637939453125,
      "learning_rate": 2.5868543994650993e-09,
      "loss": -0.0237,
      "num_tokens": 46092539.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 708.1875,
      "completions/mean_terminated_length": 675.5172119140625,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 4.783950617283951,
      "grad_norm": 1.078584452776903,
      "kl": 0.22119140625,
      "learning_rate": 2.5156162581824736e-09,
      "loss": 0.0017,
      "num_tokens": 46121725.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 706.5625,
      "completions/mean_terminated_length": 661.2142944335938,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.787037037037037,
      "grad_norm": 1.0119835354365785,
      "kl": 0.258056640625,
      "learning_rate": 2.44536779743959e-09,
      "loss": 0.0094,
      "num_tokens": 46150779.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 799.90625,
      "completions/mean_terminated_length": 737.1599731445312,
      "completions/min_length": 535.0,
      "completions/min_terminated_length": 535.0,
      "epoch": 4.790123456790123,
      "grad_norm": 0.0072914569756307,
      "kl": 0.23046875,
      "learning_rate": 2.376109298157347e-09,
      "loss": 0.0002,
      "num_tokens": 46183192.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 771.875,
      "completions/mean_terminated_length": 713.6923217773438,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 4.79320987654321,
      "grad_norm": 1.3457583086868568,
      "kl": 0.2255859375,
      "learning_rate": 2.3078410372978084e-09,
      "loss": 0.0015,
      "num_tokens": 46214012.0,
      "reward": 0.0,
      "reward_std": 0.15882496535778046,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 819.65625,
      "completions/mean_terminated_length": 712.6190795898438,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 4.796296296296296,
      "grad_norm": 1.2188384068693174,
      "kl": 0.234130859375,
      "learning_rate": 2.240563287863151e-09,
      "loss": -0.0634,
      "num_tokens": 46246933.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 771.21875,
      "completions/mean_terminated_length": 735.107177734375,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 4.799382716049383,
      "grad_norm": 0.9464528233708589,
      "kl": 0.2259521484375,
      "learning_rate": 2.174276318894497e-09,
      "loss": -0.0379,
      "num_tokens": 46278204.0,
      "reward": 0.0,
      "reward_std": 0.1379297524690628,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 712.15625,
      "completions/mean_terminated_length": 654.4074096679688,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 4.802469135802469,
      "grad_norm": 1.393970967244665,
      "kl": 0.2574462890625,
      "learning_rate": 2.1089803954708884e-09,
      "loss": 0.0402,
      "num_tokens": 46307229.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.18818417191505432,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 742.90625,
      "completions/mean_terminated_length": 702.7500610351562,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 4.805555555555555,
      "grad_norm": 0.7378650056322856,
      "kl": 0.2532958984375,
      "learning_rate": 2.0446757787082324e-09,
      "loss": -0.0256,
      "num_tokens": 46337242.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.035921063274145126,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 737.28125,
      "completions/mean_terminated_length": 707.6206665039062,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 4.8086419753086425,
      "grad_norm": 0.9522607155052633,
      "kl": 0.255126953125,
      "learning_rate": 1.98136272575819e-09,
      "loss": -0.0401,
      "num_tokens": 46367175.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1001.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 674.71875,
      "completions/mean_terminated_length": 674.71875,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.811728395061729,
      "grad_norm": 1.9686056044058784,
      "kl": 0.251953125,
      "learning_rate": 1.919041489807233e-09,
      "loss": -0.1019,
      "num_tokens": 46395134.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 803.1875,
      "completions/mean_terminated_length": 741.3599853515625,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 4.814814814814815,
      "grad_norm": 0.5787964598445375,
      "kl": 0.2301025390625,
      "learning_rate": 1.857712320075616e-09,
      "loss": -0.0004,
      "num_tokens": 46427420.0,
      "reward": -9.313225746154785e-10,
      "reward_std": 0.03592105954885483,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 759.3125,
      "completions/mean_terminated_length": 721.5000610351562,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 4.817901234567901,
      "grad_norm": 0.008318506868947691,
      "kl": 0.2283935546875,
      "learning_rate": 1.7973754618162972e-09,
      "loss": 0.0002,
      "num_tokens": 46458254.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 710.0,
      "completions/mean_terminated_length": 689.0667114257812,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 4.820987654320987,
      "grad_norm": 0.9935498356911902,
      "kl": 0.2708740234375,
      "learning_rate": 1.7380311563140737e-09,
      "loss": 0.0279,
      "num_tokens": 46486778.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.13203482329845428,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 858.0,
      "completions/mean_length": 725.5,
      "completions/mean_terminated_length": 682.857177734375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.824074074074074,
      "grad_norm": 1.0664536050126223,
      "kl": 0.260986328125,
      "learning_rate": 1.6796796408845292e-09,
      "loss": -0.0221,
      "num_tokens": 46516430.0,
      "reward": 0.0,
      "reward_std": 0.14713431894779205,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 772.40625,
      "completions/mean_terminated_length": 725.8148193359375,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 4.827160493827161,
      "grad_norm": 1.2974184614299895,
      "kl": 0.2332763671875,
      "learning_rate": 1.622321148873146e-09,
      "loss": -0.0662,
      "num_tokens": 46547979.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 797.71875,
      "completions/mean_terminated_length": 722.2916870117188,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 4.830246913580247,
      "grad_norm": 0.011663174895283918,
      "kl": 0.2509765625,
      "learning_rate": 1.5659559096543318e-09,
      "loss": 0.0003,
      "num_tokens": 46580806.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 797.0,
      "completions/mean_terminated_length": 754.9629516601562,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 4.833333333333333,
      "grad_norm": 1.600415293230517,
      "kl": 0.234130859375,
      "learning_rate": 1.5105841486304783e-09,
      "loss": -0.0301,
      "num_tokens": 46613394.0,
      "reward": -1.6643753042444587e-10,
      "reward_std": 0.1449076235294342,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.8462742445990443e-10,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 744.15625,
      "completions/mean_terminated_length": 650.875,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 4.83641975308642,
      "grad_norm": 1.4874112922698708,
      "kl": 0.25146484375,
      "learning_rate": 1.456206087231182e-09,
      "loss": 0.0409,
      "num_tokens": 46644039.0,
      "reward": -3.4924596548080444e-10,
      "reward_std": 0.1406289041042328,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.6298145055770874e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 773.84375,
      "completions/mean_terminated_length": 727.5184936523438,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 4.839506172839506,
      "grad_norm": 1.2939224766821031,
      "kl": 0.2393798828125,
      "learning_rate": 1.4028219429121912e-09,
      "loss": -0.0968,
      "num_tokens": 46674846.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 804.59375,
      "completions/mean_terminated_length": 731.4583740234375,
      "completions/min_length": 531.0,
      "completions/min_terminated_length": 531.0,
      "epoch": 4.842592592592593,
      "grad_norm": 1.8806684615615579,
      "kl": 0.208984375,
      "learning_rate": 1.350431929154655e-09,
      "loss": 0.1084,
      "num_tokens": 46707317.0,
      "reward": 0.0,
      "reward_std": 0.13786308467388153,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 799.84375,
      "completions/mean_terminated_length": 697.95458984375,
      "completions/min_length": 476.0,
      "completions/min_terminated_length": 476.0,
      "epoch": 4.845679012345679,
      "grad_norm": 0.7110150230135566,
      "kl": 0.2266845703125,
      "learning_rate": 1.2990362554642087e-09,
      "loss": 0.0182,
      "num_tokens": 46739888.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 788.34375,
      "completions/mean_terminated_length": 744.7037353515625,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 4.848765432098766,
      "grad_norm": 0.8514734727466381,
      "kl": 0.2490234375,
      "learning_rate": 1.2486351273701678e-09,
      "loss": 0.0329,
      "num_tokens": 46771275.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 972.0,
      "completions/mean_length": 731.21875,
      "completions/mean_terminated_length": 649.239990234375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 4.851851851851852,
      "grad_norm": 1.3542612170669153,
      "kl": 0.247802734375,
      "learning_rate": 1.199228746424752e-09,
      "loss": 0.0584,
      "num_tokens": 46801066.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.15539078414440155,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 718.15625,
      "completions/mean_terminated_length": 661.5184936523438,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 4.854938271604938,
      "grad_norm": 1.4127554384969723,
      "kl": 0.2669677734375,
      "learning_rate": 1.1508173102021402e-09,
      "loss": -0.0285,
      "num_tokens": 46830431.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 720.625,
      "completions/mean_terminated_length": 677.2857666015625,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.8580246913580245,
      "grad_norm": 0.9196112215753206,
      "kl": 0.2386474609375,
      "learning_rate": 1.1034010122978332e-09,
      "loss": 0.0096,
      "num_tokens": 46859731.0,
      "reward": 0.0,
      "reward_std": 0.14854103326797485,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 764.8125,
      "completions/mean_terminated_length": 727.7857666015625,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 4.861111111111111,
      "grad_norm": 1.3968624974201325,
      "kl": 0.236083984375,
      "learning_rate": 1.0569800423277652e-09,
      "loss": 0.0415,
      "num_tokens": 46891285.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.15788862109184265,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110855221748352,
      "step": 1575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 786.4375,
      "completions/mean_terminated_length": 761.862060546875,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 4.864197530864198,
      "grad_norm": 1.9272550504602735,
      "kl": 0.2047119140625,
      "learning_rate": 1.0115545859276098e-09,
      "loss": 0.2195,
      "num_tokens": 46923767.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.16878576576709747,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 747.90625,
      "completions/mean_terminated_length": 655.875,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 4.867283950617284,
      "grad_norm": 1.1959292652167877,
      "kl": 0.271728515625,
      "learning_rate": 9.67124824752058e-10,
      "loss": -0.0131,
      "num_tokens": 46954216.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1254967600107193,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 872.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 630.5,
      "completions/mean_terminated_length": 630.5,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 4.87037037037037,
      "grad_norm": 0.6496505531564528,
      "kl": 0.267578125,
      "learning_rate": 9.236909364739587e-10,
      "loss": 0.0128,
      "num_tokens": 46980476.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 714.09375,
      "completions/mean_terminated_length": 642.5769653320312,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 4.8734567901234565,
      "grad_norm": 0.8103005809067142,
      "kl": 0.2293701171875,
      "learning_rate": 8.812530947837904e-10,
      "loss": 0.005,
      "num_tokens": 47009999.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 722.1875,
      "completions/mean_terminated_length": 679.0714721679688,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "epoch": 4.8765432098765435,
      "grad_norm": 0.7007164334439001,
      "kl": 0.24365234375,
      "learning_rate": 8.39811469388857e-10,
      "loss": -0.0081,
      "num_tokens": 47039945.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 745.96875,
      "completions/mean_terminated_length": 727.433349609375,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 4.87962962962963,
      "grad_norm": 1.1087163703421545,
      "kl": 0.227783203125,
      "learning_rate": 7.99366226012621e-10,
      "loss": 0.0238,
      "num_tokens": 47070264.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.1315683275461197,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 743.78125,
      "completions/mean_terminated_length": 691.888916015625,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 4.882716049382716,
      "grad_norm": 0.8626702521618844,
      "kl": 0.2255859375,
      "learning_rate": 7.59917526394066e-10,
      "loss": 0.0123,
      "num_tokens": 47100733.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 810.5,
      "completions/mean_terminated_length": 750.719970703125,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 4.885802469135802,
      "grad_norm": 1.2176014332942944,
      "kl": 0.2484130859375,
      "learning_rate": 7.214655282870019e-10,
      "loss": 0.0664,
      "num_tokens": 47133577.0,
      "reward": 0.0,
      "reward_std": 0.1497526913881302,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 695.5,
      "completions/mean_terminated_length": 661.5172119140625,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 4.888888888888889,
      "grad_norm": 0.703182392089653,
      "kl": 0.2611083984375,
      "learning_rate": 6.840103854595103e-10,
      "loss": -0.0116,
      "num_tokens": 47162541.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1003.0,
      "completions/mean_length": 814.65625,
      "completions/mean_terminated_length": 689.0499877929688,
      "completions/min_length": 552.0,
      "completions/min_terminated_length": 552.0,
      "epoch": 4.8919753086419755,
      "grad_norm": 1.2877794159444154,
      "kl": 0.2601318359375,
      "learning_rate": 6.475522476932504e-10,
      "loss": -0.0381,
      "num_tokens": 47195418.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 768.0,
      "completions/mean_length": 722.0,
      "completions/mean_terminated_length": 621.3333740234375,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 4.895061728395062,
      "grad_norm": 0.007863565238588218,
      "kl": 0.247802734375,
      "learning_rate": 6.120912607829598e-10,
      "loss": 0.0002,
      "num_tokens": 47225034.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 788.25,
      "completions/mean_terminated_length": 709.6666870117188,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 4.898148148148148,
      "grad_norm": 1.6012108367231617,
      "kl": 0.2738037109375,
      "learning_rate": 5.776275665357045e-10,
      "loss": -0.003,
      "num_tokens": 47256506.0,
      "reward": 0.0,
      "reward_std": 0.19149985909461975,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 703.46875,
      "completions/mean_terminated_length": 682.1000366210938,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 4.901234567901234,
      "grad_norm": 2.8066887905064943,
      "kl": 0.244140625,
      "learning_rate": 5.441613027704905e-10,
      "loss": 0.1544,
      "num_tokens": 47285165.0,
      "reward": 0.0,
      "reward_std": 0.17153745889663696,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 773.9375,
      "completions/mean_terminated_length": 690.5833740234375,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 4.904320987654321,
      "grad_norm": 0.8189136246632134,
      "kl": 0.240478515625,
      "learning_rate": 5.116926033176261e-10,
      "loss": 0.0101,
      "num_tokens": 47315951.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 678.4375,
      "completions/mean_terminated_length": 655.4000244140625,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 4.907407407407407,
      "grad_norm": 0.934045545733305,
      "kl": 0.2373046875,
      "learning_rate": 4.802215980182212e-10,
      "loss": -0.02,
      "num_tokens": 47343765.0,
      "reward": 7.450580596923828e-09,
      "reward_std": 0.15339991450309753,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 736.90625,
      "completions/mean_terminated_length": 683.74072265625,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 4.910493827160494,
      "grad_norm": 1.7751233737098953,
      "kl": 0.2236328125,
      "learning_rate": 4.4974841272357734e-10,
      "loss": 0.0371,
      "num_tokens": 47374326.0,
      "reward": -1.3969838619232178e-09,
      "reward_std": 0.1641397774219513,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 696.0,
      "completions/mean_terminated_length": 662.0689697265625,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 4.91358024691358,
      "grad_norm": 2.528655484875274,
      "kl": 0.2281494140625,
      "learning_rate": 4.2027316929479916e-10,
      "loss": 0.1401,
      "num_tokens": 47402598.0,
      "reward": -3.725290298461914e-09,
      "reward_std": 0.22804805636405945,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1592
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 735.375,
      "completions/mean_terminated_length": 681.9259033203125,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 4.916666666666667,
      "grad_norm": 0.006457239826496087,
      "kl": NaN,
      "learning_rate": 3.917959856022668e-10,
      "loss": 0.0002,
      "num_tokens": 47432426.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1593
    },
    {
      "clip_ratio/high_max": NaN,
      "clip_ratio/high_mean": NaN,
      "clip_ratio/low_mean": NaN,
      "clip_ratio/low_min": NaN,
      "clip_ratio/region_mean": NaN,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 861.46875,
      "completions/mean_terminated_length": 750.26318359375,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 4.919753086419753,
      "grad_norm": 0.9350426960796004,
      "kl": NaN,
      "learning_rate": 3.6431697552510853e-10,
      "loss": -0.0278,
      "num_tokens": 47466845.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 787.25,
      "completions/mean_terminated_length": 762.7586059570312,
      "completions/min_length": 545.0,
      "completions/min_terminated_length": 545.0,
      "epoch": 4.922839506172839,
      "grad_norm": 0.8366877610922199,
      "kl": 0.2413330078125,
      "learning_rate": 3.3783624895086795e-10,
      "loss": 0.0106,
      "num_tokens": 47498905.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 747.78125,
      "completions/mean_terminated_length": 684.0385131835938,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 4.925925925925926,
      "grad_norm": 0.6858457371625005,
      "kl": 0.2178955078125,
      "learning_rate": 3.123539117749485e-10,
      "loss": 0.0136,
      "num_tokens": 47529498.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 700.8125,
      "completions/mean_terminated_length": 679.2667236328125,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 4.929012345679013,
      "grad_norm": 0.7573476255525109,
      "kl": 0.2576904296875,
      "learning_rate": 2.8787006590022535e-10,
      "loss": -0.0105,
      "num_tokens": 47558172.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 711.84375,
      "completions/mean_terminated_length": 639.8077392578125,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 4.932098765432099,
      "grad_norm": 1.0341470735788636,
      "kl": 0.2425537109375,
      "learning_rate": 2.6438480923665627e-10,
      "loss": 0.0008,
      "num_tokens": 47587251.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1005.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 728.0625,
      "completions/mean_terminated_length": 728.0625,
      "completions/min_length": 333.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 4.935185185185185,
      "grad_norm": 0.9476703452669963,
      "kl": 0.2508544921875,
      "learning_rate": 2.418982357008936e-10,
      "loss": 0.0235,
      "num_tokens": 47617221.0,
      "reward": 0.0,
      "reward_std": 0.14281287789344788,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": -2.3283064365386963e-10,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 769.21875,
      "completions/mean_terminated_length": 710.423095703125,
      "completions/min_length": 525.0,
      "completions/min_terminated_length": 525.0,
      "epoch": 4.938271604938271,
      "grad_norm": 1.2376217489149188,
      "kl": 0.232666015625,
      "learning_rate": 2.2041043521586756e-10,
      "loss": 0.0108,
      "num_tokens": 47648400.0,
      "reward": 0.0,
      "reward_std": 0.15834946930408478,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 780.90625,
      "completions/mean_terminated_length": 724.8077392578125,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 4.9413580246913575,
      "grad_norm": 1.2657420931607952,
      "kl": 0.21630859375,
      "learning_rate": 1.999214937104532e-10,
      "loss": -0.0838,
      "num_tokens": 47680125.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.148421049118042,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 732.03125,
      "completions/mean_terminated_length": 712.5667114257812,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 4.944444444444445,
      "grad_norm": 0.5502229114422817,
      "kl": 0.25732421875,
      "learning_rate": 1.8043149311916529e-10,
      "loss": 0.0278,
      "num_tokens": 47709962.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 804.71875,
      "completions/mean_terminated_length": 689.857177734375,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 4.947530864197531,
      "grad_norm": 2.687994689418515,
      "kl": 0.2730712890625,
      "learning_rate": 1.6194051138176955e-10,
      "loss": 0.0248,
      "num_tokens": 47742369.0,
      "reward": 1.862645149230957e-09,
      "reward_std": 0.15876935422420502,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 725.53125,
      "completions/mean_terminated_length": 670.25927734375,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 4.950617283950617,
      "grad_norm": 1.5022934438874624,
      "kl": 0.3040771484375,
      "learning_rate": 1.444486224429775e-10,
      "loss": -0.0243,
      "num_tokens": 47771978.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.1789141744375229,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 777.78125,
      "completions/mean_terminated_length": 708.8399658203125,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 4.953703703703704,
      "grad_norm": 1.2135377538704588,
      "kl": 0.226806640625,
      "learning_rate": 1.2795589625216875e-10,
      "loss": -0.0884,
      "num_tokens": 47803923.0,
      "reward": 0.0,
      "reward_std": 0.1385057121515274,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 722.1875,
      "completions/mean_terminated_length": 679.0714721679688,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 4.95679012345679,
      "grad_norm": 1.0589730453922852,
      "kl": 0.24609375,
      "learning_rate": 1.1246239876316899e-10,
      "loss": -0.0525,
      "num_tokens": 47833509.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 749.40625,
      "completions/mean_terminated_length": 731.1000366210938,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 4.959876543209877,
      "grad_norm": 1.1596273741453609,
      "kl": 0.2667236328125,
      "learning_rate": 9.796819193383376e-11,
      "loss": 0.0125,
      "num_tokens": 47864114.0,
      "reward": 2.7939677238464355e-09,
      "reward_std": 0.16112922132015228,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 9.313225746154785e-10,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 777.65625,
      "completions/mean_terminated_length": 695.5416870117188,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 4.962962962962963,
      "grad_norm": 0.590046245932475,
      "kl": 0.229248046875,
      "learning_rate": 8.447333372593735e-11,
      "loss": -0.0017,
      "num_tokens": 47895803.0,
      "reward": 0.02812499925494194,
      "reward_std": 0.05624999850988388,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.03125,
      "rewards/logprob_reward/std": 0.1767766922712326,
      "step": 1608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 726.53125,
      "completions/mean_terminated_length": 695.7586059570312,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 4.966049382716049,
      "grad_norm": 1.441134207155461,
      "kl": 0.224365234375,
      "learning_rate": 7.197787810492295e-11,
      "loss": -0.0356,
      "num_tokens": 47925560.0,
      "reward": -7.450580596923828e-09,
      "reward_std": 0.20232830941677094,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": -3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4016096591949463,
      "step": 1609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 731.3125,
      "completions/mean_terminated_length": 701.0344848632812,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 4.969135802469136,
      "grad_norm": 1.4331554563402127,
      "kl": 0.232666015625,
      "learning_rate": 6.04818750396252e-11,
      "loss": 0.0416,
      "num_tokens": 47955610.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 785.4375,
      "completions/mean_terminated_length": 730.3846435546875,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 4.972222222222222,
      "grad_norm": 1.3903098779452725,
      "kl": 0.249267578125,
      "learning_rate": 4.9985370502131366e-11,
      "loss": 0.0078,
      "num_tokens": 47987544.0,
      "reward": -5.587935447692871e-09,
      "reward_std": 0.1821010708808899,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3110854923725128,
      "step": 1611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1008.0,
      "completions/mean_length": 755.46875,
      "completions/mean_terminated_length": 633.4091186523438,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 4.9753086419753085,
      "grad_norm": 0.949442702717208,
      "kl": 0.2928466796875,
      "learning_rate": 4.0488406467559245e-11,
      "loss": -0.005,
      "num_tokens": 48018491.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 751.75,
      "completions/mean_terminated_length": 723.586181640625,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 4.978395061728395,
      "grad_norm": 0.9170827216344197,
      "kl": 0.2598876953125,
      "learning_rate": 3.1991020913890723e-11,
      "loss": -0.042,
      "num_tokens": 48048823.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 725.0,
      "completions/mean_terminated_length": 705.0667114257812,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 4.981481481481482,
      "grad_norm": 3.9707542582075344,
      "kl": 0.2298583984375,
      "learning_rate": 2.449324782183293e-11,
      "loss": -0.2022,
      "num_tokens": 48079019.0,
      "reward": 0.0,
      "reward_std": 0.3038697838783264,
      "rewards/format_reward_func/mean": -1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 3.725290298461914e-09,
      "rewards/logprob_reward/std": 0.4399413466453552,
      "step": 1614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 749.375,
      "completions/mean_terminated_length": 720.9655151367188,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 4.984567901234568,
      "grad_norm": 1.1168254213397273,
      "kl": 0.235107421875,
      "learning_rate": 1.799511717470725e-11,
      "loss": 0.0139,
      "num_tokens": 48109531.0,
      "reward": -1.862645149230957e-09,
      "reward_std": 0.1596047431230545,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 4.656612873077393e-10,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 776.5,
      "completions/mean_terminated_length": 694.0,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 4.987654320987654,
      "grad_norm": 0.8447286813106216,
      "kl": 0.2164306640625,
      "learning_rate": 1.2496654958310537e-11,
      "loss": -0.0075,
      "num_tokens": 48140567.0,
      "reward": 9.313225746154785e-10,
      "reward_std": 0.05163978040218353,
      "rewards/format_reward_func/mean": 1.4901161193847656e-08,
      "rewards/format_reward_func/std": 1.0160009860992432,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.0,
      "step": 1616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 677.84375,
      "completions/mean_terminated_length": 654.7667236328125,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 4.9907407407407405,
      "grad_norm": 1.1225410716703794,
      "kl": 0.255615234375,
      "learning_rate": 7.997883160748563e-12,
      "loss": -0.0305,
      "num_tokens": 48168210.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 736.96875,
      "completions/mean_terminated_length": 695.9642944335938,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 4.993827160493828,
      "grad_norm": 0.7400796420431668,
      "kl": 0.2340087890625,
      "learning_rate": 4.4988197724360465e-12,
      "loss": -0.0066,
      "num_tokens": 48198485.0,
      "reward": 0.0,
      "reward_std": 0.11249999701976776,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.2540002465248108,
      "step": 1618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 696.125,
      "completions/mean_terminated_length": 662.2069091796875,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 4.996913580246914,
      "grad_norm": 1.1668935899083517,
      "kl": 0.2413330078125,
      "learning_rate": 1.9994787860133646e-12,
      "loss": -0.0613,
      "num_tokens": 48227069.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.17175260186195374,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 1.862645149230957e-09,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 693.125,
      "completions/mean_terminated_length": 671.0667114257812,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 5.0,
      "grad_norm": 1.4493082750300388,
      "kl": 0.256103515625,
      "learning_rate": 4.998701962355412e-13,
      "loss": -0.0149,
      "num_tokens": 48255557.0,
      "reward": 3.725290298461914e-09,
      "reward_std": 0.17160141468048096,
      "rewards/format_reward_func/mean": 0.0,
      "rewards/format_reward_func/std": 0.0,
      "rewards/logprob_reward/mean": 0.0,
      "rewards/logprob_reward/std": 0.3592106103897095,
      "step": 1620
    },
    {
      "epoch": 5.0,
      "step": 1620,
      "total_flos": 0.0,
      "train_loss": -0.025403620972905414,
      "train_runtime": 20225.882,
      "train_samples_per_second": 0.641,
      "train_steps_per_second": 0.08
    }
  ],
  "logging_steps": 1,
  "max_steps": 1620,
  "num_input_tokens_seen": 48255557,
  "num_train_epochs": 5,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}